<a href="https://colab.research.google.com/github/yangyadi/Case/blob/main/DataScience_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

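Objective: Binary classification — predict whether a flight is delayed by more than 15 minutes (target = 1) or not (target = 0).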

0. Import Libraries

In [None]:
# Import
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier  # optional

print("Setup complete.")

1. Load and explore data

In [None]:
# Load data
df = pd.read_csv('dataset.csv')

# A notebook cell renders only its final bare expression, so a bare
# `df.head()` in the middle of the cell would be evaluated and thrown away.
# Display it explicitly so the row preview actually appears.
display(df.head())

# info() prints its summary (dtypes, non-null counts) straight to stdout.
df.info()

# Last expression of the cell: rendered as a rich table automatically.
df.describe()


2. Data preprocessing

In [None]:
# Threshold above which a row is labelled positive. The original template
# annotates this as "Delayed if >15 min", so the unit is presumably minutes.
DELAY_THRESHOLD = 15

# Drop rows with missing values, then derive the label and the engineered
# feature in a single chain so the cell can be re-run safely and no column is
# written into a filtered frame.
# NOTE: 'your_target_column' is a template placeholder; replace it with the
# real column that holds the delay value.
df = df.dropna().assign(
    # Vectorized comparison; equivalent to apply(lambda x: 1 if x > 15 else 0).
    target=lambda d: (d['your_target_column'] > DELAY_THRESHOLD).astype(int),
    # Feature engineering: hour of day of the scheduled departure.
    hour=lambda d: pd.to_datetime(d['scheduled_departure']).dt.hour,
)

# Defining features and target.
# The template previously re-assigned X and y right after this from columns
# named 'departure_hour' and 'delayed', which nothing in this notebook creates.
# That silently discarded the engineered 'hour'/'target' columns. If your
# dataset already ships such columns, list them in `features` instead of
# adding a second assignment.
features = ['hour', 'distance', 'airline', 'origin', 'destination']
X = df[features]
y = df['target']

# Encoding for categorical variables: one-hot encode, dropping the first level
# of each categorical column to avoid redundant indicator columns.
X_encoded = pd.get_dummies(X, drop_first=True)

3. Train-test split (cross-validation follows in step 5)

In [None]:
# Splitting data: hold out 20% of the rows as a test set.
# stratify=y keeps the proportion of positive/negative labels the same in the
# train and test partitions, matching the stratified folds this notebook uses
# for cross-validation. Without it an imbalanced target can yield a test set
# with a noticeably different class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

4. Pipeline + Model

In [None]:

# Ordered (name, estimator) steps. The classifier step is named 'clf' so it can
# be retrieved from the fitted pipeline by that key.
model_steps = [
    # Scaling is optional for tree-based models; kept so the classifier can be
    # swapped for a scale-sensitive one without touching the rest.
    ('scaler', StandardScaler()),
    # To try XGBoost instead, replace the step below with
    #   ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    # NOTE(review): `use_label_encoder` is reportedly deprecated in recent
    # xgboost releases; verify against the installed version.
    ('clf', RandomForestClassifier(random_state=6)),
]

pipeline = Pipeline(steps=model_steps)

5. Cross validation

In [None]:

# Metrics computed on every fold; results appear in `scores` under the keys
# 'test_<metric>'.
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# 5 shuffled, stratified folds with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the whole pipeline (scaler + classifier) on each fold.
scores = cross_validate(
    estimator=pipeline,
    X=X_encoded,
    y=y,
    scoring=metric_names,
    cv=cv,
)

6. Model evaluation

In [None]:

# Report the mean of each cross-validated metric across folds.
# Each entry pairs the printed label with its key in the `scores` dict.
metric_report = (
    ("Accuracy:", 'test_accuracy'),
    ("Precision:", 'test_precision'),
    ("Recall:", 'test_recall'),
    ("F1 Score:", 'test_f1'),
    ("ROC AUC:", 'test_roc_auc'),
)

for label, score_key in metric_report:
    print(label, scores[score_key].mean())




7. Fit final model for feature importance

In [None]:
# Refit the pipeline on the full encoded dataset, then pull out the classifier
# step to inspect which features it relied on most.
pipeline.fit(X_encoded, y)
model = pipeline.named_steps['clf']

# Not every classifier exposes feature_importances_ (e.g. LogisticRegression
# does not), so the report is skipped when the attribute is absent.
raw_importances = getattr(model, 'feature_importances_', None)
if raw_importances is not None:
    importances = pd.Series(raw_importances, index=X_encoded.columns)
    ranked = importances.sort_values(ascending=False)
    # Show the ten highest-ranked features.
    print(ranked.head(10))
