In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
DRIVE_PATH = '/content/drive/MyDrive/train_data.csv'
TARGET_COLUMN = None
TEST_SIZE = 0.2
RANDOM_SEED = 42
SAVE_DIR = '/content/drive/MyDrive/model_output'

import os
os.makedirs(SAVE_DIR, exist_ok=True)
print("SAVE_DIR:", SAVE_DIR)


SAVE_DIR: /content/drive/MyDrive/model_output


In [4]:
import os, json, joblib, random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, accuracy_score

random.seed(RANDOM_SEED)

In [5]:
from pathlib import Path
p = Path(DRIVE_PATH)
print('Trying to load:', DRIVE_PATH)
if not p.exists():
    print('ERROR: file not found at that path ->', DRIVE_PATH)
    parent = p.parent
    print('Files in directory:', parent)
    try:
        for i, f in enumerate(sorted(parent.glob('*'))):
            if i >= 200: break
            print('-', f.name)
    except Exception as e:
        print('Could not list folder contents:', e)
    raise FileNotFoundError(f'File not found: {DRIVE_PATH}')

# read file robustly
ext = p.suffix.lower()
try:
    if ext in ('.xls', '.xlsx'):
        print('Detected Excel file. Using read_excel()')
        df = pd.read_excel(DRIVE_PATH)
    else:
        try:
            df = pd.read_csv(DRIVE_PATH)
        except UnicodeDecodeError:
            print('UnicodeDecodeError -> retrying with latin1')
            df = pd.read_csv(DRIVE_PATH, encoding='latin1')
        except pd.errors.ParserError:
            print('ParserError -> retrying with engine=python, sep=None')
            df = pd.read_csv(DRIVE_PATH, engine='python', sep=None)
except Exception as e:
    print('Failed to read file:', type(e).__name__, e)
    raise

print('\nLoaded. Shape:', df.shape)
print('\nColumns:')
for i,c in enumerate(df.columns[:200], 1): print(f'  {i}. {c}')
print('\nFirst 5 rows:')
display(df.head())
print('\nDtypes:')
print(df.dtypes)
print('\nTop missing counts:')
print(df.isnull().sum().sort_values(ascending=False).head(20))

# target selection logic
try:
    TARGET_COLUMN
except NameError:
    TARGET_COLUMN = None

chosen = None
if TARGET_COLUMN and TARGET_COLUMN in df.columns:
    chosen = TARGET_COLUMN
    print('\nUsing user-specified TARGET_COLUMN:', chosen)
else:
    common = ['target','label','y','price','class','Outcome','outcome']
    for cand in common:
        if cand in df.columns:
            chosen = cand
            print('\nAuto-detected target column:', cand)
            break
    if chosen is None:
        chosen = df.columns[-1]
        print('\nDefaulting to last column as target:', chosen, '(change TARGET_COLUMN if incorrect)')

# assign X,y and try convert numeric-like target stored as text
y = df[chosen]
X = df.drop(columns=[chosen])
print('\nSelected target:', chosen, '  dtype:', y.dtype, '  shape:', y.shape)
print('Sample target value counts (up to 20):')
print(y.value_counts(dropna=False).head(20))

if not pd.api.types.is_numeric_dtype(y):
    y_conv = pd.to_numeric(y, errors='coerce')
    non_na_frac = y_conv.notna().mean()
    if non_na_frac > 0.9:
        print('\nTarget looks numeric as text. Converting to numeric (coerce NAs).')
        y = y_conv
        df[chosen] = y
    else:
        print('\nTarget appears non-numeric -> classification assumed.')

# expose to globals for next cells
globals().update({'df': df, 'X': X, 'y': y, 'TARGET_COLUMN': chosen})
print('\nDone. Variables available: df, X, y, TARGET_COLUMN')


Trying to load: /content/drive/MyDrive/train_data.csv

Loaded. Shape: (792, 17)

Columns:
  1. Unnamed: 0
  2. PassengerId
  3. Survived
  4. Sex
  5. Age
  6. Fare
  7. Pclass_1
  8. Pclass_2
  9. Pclass_3
  10. Family_size
  11. Title_1
  12. Title_2
  13. Title_3
  14. Title_4
  15. Emb_1
  16. Emb_2
  17. Emb_3

First 5 rows:


Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Family_size,Title_1,Title_2,Title_3,Title_4,Emb_1,Emb_2,Emb_3
0,0,1,0,1,0.275,0.014151,0,0,1,0.1,1,0,0,0,0,0,1
1,1,2,1,0,0.475,0.139136,1,0,0,0.1,1,0,0,0,1,0,0
2,2,3,1,0,0.325,0.015469,0,0,1,0.0,0,0,0,1,0,0,1
3,3,4,1,0,0.4375,0.103644,1,0,0,0.1,1,0,0,0,0,0,1
4,4,5,0,1,0.4375,0.015713,0,0,1,0.0,1,0,0,0,0,0,1



Dtypes:
Unnamed: 0       int64
PassengerId      int64
Survived         int64
Sex              int64
Age            float64
Fare           float64
Pclass_1         int64
Pclass_2         int64
Pclass_3         int64
Family_size    float64
Title_1          int64
Title_2          int64
Title_3          int64
Title_4          int64
Emb_1            int64
Emb_2            int64
Emb_3            int64
dtype: object

Top missing counts:
Unnamed: 0     0
PassengerId    0
Survived       0
Sex            0
Age            0
Fare           0
Pclass_1       0
Pclass_2       0
Pclass_3       0
Family_size    0
Title_1        0
Title_2        0
Title_3        0
Title_4        0
Emb_1          0
Emb_2          0
Emb_3          0
dtype: int64

Defaulting to last column as target: Emb_3 (change TARGET_COLUMN if incorrect)

Selected target: Emb_3   dtype: int64   shape: (792,)
Sample target value counts (up to 20):
Emb_3
1    571
0    221
Name: count, dtype: int64

Done. Variables available: df, X, y, T

In [6]:
# infer task
if pd.api.types.is_numeric_dtype(y):
    uniq = int(y.nunique(dropna=True))
    task = 'regression' if uniq > 20 else 'classification'
else:
    task = 'classification'
print('Inferred task:', task)

# split (stratify for classification)
if task == 'classification':
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

print('Train:', X_train.shape, 'Test:', X_test.shape)

Inferred task: classification
Train: (633, 16) Test: (159, 16)


In [7]:
numeric_feats = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_feats = X_train.select_dtypes(include=['object','category','bool']).columns.tolist()

transformers = []
if numeric_feats:
    transformers.append(('num', StandardScaler(), numeric_feats))
if categorical_feats:
    # if memory is tight, set sparse=True
    transformers.append(('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_feats))

if not transformers:
    preprocessor = 'passthrough'
else:
    preprocessor = ColumnTransformer(transformers=transformers)

print('Numeric:', numeric_feats)
print('Categorical:', categorical_feats)


Numeric: ['Unnamed: 0', 'PassengerId', 'Survived', 'Sex', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Family_size', 'Title_1', 'Title_2', 'Title_3', 'Title_4', 'Emb_1', 'Emb_2']
Categorical: []


In [8]:
if task == 'classification':
    base_model = RandomForestClassifier(random_state=RANDOM_SEED)
else:
    base_model = RandomForestRegressor(random_state=RANDOM_SEED)

steps = []
if preprocessor != 'passthrough':
    steps.append(('preprocess', preprocessor))
steps.append(('model', base_model))
pipe = Pipeline(steps)

print('Training baseline model (this can take time)...')
pipe.fit(X_train, y_train)
print('Baseline trained.')

Training baseline model (this can take time)...
Baseline trained.


In [9]:
if task == 'regression':
    preds = pipe.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, preds)
    print(f'Baseline MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}')
    plt.figure(figsize=(6,6))
    plt.scatter(y_test, preds, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--')
    plt.xlabel('Actual'); plt.ylabel('Predicted'); plt.show()
else:
    preds = pipe.predict(X_test)
    print('Baseline classification report:')
    print(classification_report(y_test, preds, zero_division=0))
    print('Confusion matrix:')
    print(confusion_matrix(y_test, preds))


Baseline classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        44
           1       1.00      1.00      1.00       115

    accuracy                           1.00       159
   macro avg       1.00      1.00      1.00       159
weighted avg       1.00      1.00      1.00       159

Confusion matrix:
[[ 44   0]
 [  0 115]]


In [10]:
param_dist = {
    'model__n_estimators': [100,200,500],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2,5,10],
    'model__min_samples_leaf': [1,2,4]
}
scoring = 'f1_weighted' if task == 'classification' else 'neg_mean_squared_error'

search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=20, cv=5,
                            scoring=scoring, random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
print('Running search (this may take several minutes)...')
search.fit(X_train, y_train)
print('Best params:', search.best_params_)

Running search (this may take several minutes)...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'model__n_estimators': 500, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_depth': 20}


In [11]:
best = search.best_estimator_
if task == 'regression':
    preds = best.predict(X_test)
    mse = mean_squared_error(y_test, preds); rmse = mse**0.5; r2 = r2_score(y_test, preds)
    print(f'Tuned MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}')
    plt.figure(figsize=(6,6)); plt.scatter(y_test,preds,alpha=0.6); plt.plot([y_test.min(), y_test.max()],[y_test.min(), y_test.max()],'--'); plt.show()
else:
    preds = best.predict(X_test)
    print('Tuned classification report:')
    print(classification_report(y_test, preds, zero_division=0))
    print('Confusion matrix:')
    print(confusion_matrix(y_test, preds))

Tuned classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        44
           1       1.00      1.00      1.00       115

    accuracy                           1.00       159
   macro avg       1.00      1.00      1.00       159
weighted avg       1.00      1.00      1.00       159

Confusion matrix:
[[ 44   0]
 [  0 115]]


In [12]:
model_path = Path(SAVE_DIR) / 'best_model.joblib'
joblib.dump(best, model_path)
print('Saved model to', model_path)

summary = {
    'task': task,
    'target_column': TARGET_COLUMN,
    'best_params': search.best_params_
}
with open(Path(SAVE_DIR) / 'training_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)
print('Saved training summary to', SAVE_DIR)

Saved model to /content/drive/MyDrive/model_output/best_model.joblib
Saved training summary to /content/drive/MyDrive/model_output
