In [1]:
# Install required libraries
!pip install imbalanced-learn --quiet

# --- IMPORTS ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from google.colab import files

# --- LOAD DATA ---
# files.upload() shows Colab's upload widget; the keys of the returned mapping
# are the names of the uploaded files.
uploaded = files.upload()  # Upload your CSV file
if not uploaded:
    # The widget was dismissed without selecting a file. Fail with an explicit
    # message instead of an opaque IndexError on an empty key list.
    raise ValueError("No file was uploaded; upload a CSV file and re-run this cell.")

# Only the first uploaded file is read; any additional uploads are ignored.
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)
print("Data loaded successfully!")
print(df.head())

# --- CREATE TARGET COLUMN ---
# Sum 'Sales' per customer, bucket those totals into three equal-frequency bins
# (Low / Medium / High), then copy each customer's label onto all of their rows.
customer_sales = df.groupby('Customer ID', as_index=False)['Sales'].sum()
customer_sales['Customer_Value'] = pd.qcut(
    customer_sales['Sales'],
    q=3,
    labels=['Low', 'Medium', 'High'],
)
customer_tiers = customer_sales[['Customer ID', 'Customer_Value']]
# Left join keeps every original row of df and adds the 'Customer_Value' column.
df = df.merge(customer_tiers, on='Customer ID', how='left')

# --- FEATURES & TARGET ---
target_col = 'Customer_Value'

# Identifier, name and date columns are excluded from the feature matrix.
drop_cols = [
    'Customer ID', 'Customer Name',
    'Order ID', 'Row ID',
    'Product ID', 'Product Name',
    'Order Date', 'Ship Date',
]

y = df[target_col]

# NOTE(review): 'Sales' is not in drop_cols, so it stays in X even though the
# target is derived from per-customer 'Sales' totals -- confirm this is intended.
# One-hot encode the remaining categorical columns, dropping the first level of each.
X = pd.get_dummies(df.drop(columns=[*drop_cols, target_col]), drop_first=True)

# --- SPLIT DATASET ---
# Split BEFORE fitting any preprocessing step, so the statistics learned by the
# imputer and the scaler come from the training rows only. Fitting the imputer on
# the full dataset would let test-set values influence the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- HANDLE MISSING VALUES ---
# Medians are learned on the training split and re-used unchanged on the test split.
# X itself is left un-imputed; only the split arrays below are transformed.
imputer = SimpleImputer(strategy='median')  # Use 'mean' or 'most_frequent' if you prefer
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# --- SCALE NUMERIC FEATURES ---
# Same rule: fit on train, apply to test. After this step X_train / X_test are
# NumPy arrays, as they were in the original flow.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- HANDLE IMBALANCE WITH SMOTE ---
# Only the training split is resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Preprocessing, missing value handling, and SMOTE completed!")
# Report the training-set shapes before and after resampling.
for caption, feats, target in (
    ("Original training set shape:", X_train, y_train),
    ("Resampled training set shape:", X_train_res, y_train_res),
):
    print(caption, feats.shape, target.shape)


Saving train.csv to train.csv
Data loaded successfully!
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florid

In [2]:
# === Random Forest Training and Evaluation ===
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from tqdm.notebook import tqdm
import joblib
import numpy as np

# --- Optional: progress display for GridSearch ---
class TQDMSearchCV(GridSearchCV):
    """GridSearchCV that advances a tqdm progress bar once per candidate.

    Drop-in replacement for ``GridSearchCV``: the constructor arguments and the
    inherited ``fit`` are unchanged. The search is scheduled through the
    ``_run_search`` hook, so the grid is evaluated exactly once and the bar total
    equals the real number of parameter combinations.

    Note: candidates are submitted one at a time so the bar can tick between
    them. With ``n_jobs`` set, parallelism is therefore across the CV folds of
    a single candidate rather than across the whole grid. Use ``verbose=0`` to
    avoid a "Fitting ... folds" log line per candidate.
    """

    def _run_search(self, evaluate_candidates):
        """Evaluate every candidate in ``self.param_grid``, updating the bar after each."""
        # Local import: ParameterGrid is not among this notebook's top-level imports.
        from sklearn.model_selection import ParameterGrid

        # ParameterGrid expands the dict (or list of dicts) into concrete settings.
        candidates = list(ParameterGrid(self.param_grid))
        with tqdm(total=len(candidates), desc="GridSearch Progress", unit="param") as pbar:
            for params in candidates:
                # Results from successive calls are accumulated by the base class.
                evaluate_candidates([params])
                pbar.update(1)

# --- Hyperparameter grid ---
# 3 * 4 * 3 * 3 * 2 * 1 = 216 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced'],
)

# --- Random Forest model ---
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# --- Grid search ---
# This uses plain GridSearchCV (not the tqdm subclass); progress is reported
# through scikit-learn's own logging via verbose=2.
# NOTE(review): both the forest and the search request n_jobs=-1 -- confirm the
# nested parallelism does not oversubscribe the machine.
search_settings = dict(cv=3, scoring='f1_weighted', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(rf, param_grid, **search_settings)

# --- Train model ---
print("Training Random Forest (this might take a few minutes)...")
# fit() returns the search object itself, so the refitted best model can be
# read straight off the call.
best_rf = grid_search.fit(X_train_res, y_train_res).best_estimator_
print("\nBest Hyperparameters:", grid_search.best_params_)

# --- Evaluate on test set ---
# Compute every metric first, then print them in a fixed order.
y_pred_rf = best_rf.predict(X_test)
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
report_rf = classification_report(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print(f"\nF1-Score (weighted): {f1_rf:.4f}\n")
print("=== Random Forest Evaluation ===")
print(report_rf)
# Rows and columns follow the same label order as the report above.
print("\nConfusion Matrix:\n", cm_rf)

# --- Save model ---
# Only the fitted forest is written here; the imputer and scaler are not saved.
joblib.dump(best_rf, 'random_forest_customer_value.pkl')
print("\nRandom Forest model saved successfully!")


Training Random Forest (this might take a few minutes)...
Fitting 3 folds for each of 216 candidates, totalling 648 fits

Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

F1-Score (weighted): 0.4997

=== Random Forest Evaluation ===
              precision    recall  f1-score   support

        High       0.56      0.66      0.60       888
         Low       0.39      0.29      0.33       401
      Medium       0.49      0.44      0.46       671

    accuracy                           0.51      1960
   macro avg       0.48      0.46      0.47      1960
weighted avg       0.50      0.51      0.50      1960


Confusion Matrix:
 [[586  96 206]
 [177 116 108]
 [291  84 296]]

Random Forest model saved successfully!


In [3]:
import pickle
from google.colab import files

# --- Save trained model ---
# Writes to the same path used by the earlier joblib.dump call, replacing that file.
model_path = "random_forest_customer_value.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(best_rf, model_file)

print("✅ Random Forest model saved as random_forest_customer_value.pkl")

# --- Download the pickle file ---
# Triggers a browser download of the file from the Colab runtime.
files.download(model_path)


✅ Random Forest model saved as random_forest_customer_value.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>