In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import catboost as cb

In [81]:
df = pd.read_csv('feature.csv')

target = 'target'
X = df.drop(columns=['id', target])
y = df[target]


In [82]:
binary_features = [col for col in X.columns if 'bin_' in col]
ordinal_features = [col for col in X.columns if 'ord_' in col]
nominal_features = [col for col in X.columns if 'nom_' in col]


In [83]:
for col in ordinal_features:
    X[col] = LabelEncoder().fit_transform(X[col])

In [84]:
low_cardinality_nominals = [col for col in nominal_features if X[col].nunique() < 10]
high_cardinality_nominals = [col for col in nominal_features if X[col].nunique() >= 10]

ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
X_ohe = pd.DataFrame(ohe.fit_transform(X[low_cardinality_nominals]), columns=ohe.get_feature_names_out())
X = X.drop(columns=low_cardinality_nominals).reset_index(drop=True)
X = pd.concat([X, X_ohe], axis=1)


In [85]:
te = TargetEncoder()
X[high_cardinality_nominals] = te.fit_transform(X[high_cardinality_nominals], y)

In [86]:
for col in high_cardinality_nominals:
    freq_map = X[col].value_counts() / len(X)
    X[col] = X[col].map(freq_map)

In [87]:
X['ord_sum'] = X[ordinal_features].sum(axis=1)


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [89]:
print(X_train.dtypes)


bin_0               float64
bin_1               float64
bin_2               float64
bin_3                object
bin_4                object
nom_5               float64
nom_6               float64
nom_7               float64
nom_8               float64
nom_9               float64
ord_0                 int64
ord_1                 int64
ord_2                 int64
ord_3                 int64
ord_4                 int64
ord_5                 int64
day                 float64
month               float64
nom_0_Green         float64
nom_0_Red           float64
nom_0_nan           float64
nom_1_Polygon       float64
nom_1_Square        float64
nom_1_Star          float64
nom_1_Trapezoid     float64
nom_1_Triangle      float64
nom_1_nan           float64
nom_2_Cat           float64
nom_2_Dog           float64
nom_2_Hamster       float64
nom_2_Lion          float64
nom_2_Snake         float64
nom_2_nan           float64
nom_3_China         float64
nom_3_Costa Rica    float64
nom_3_Finland       

In [90]:
for col in ['bin_3', 'bin_4']:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [91]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [92]:
print(np.isnan(X_train).sum())  
print(np.isnan(X_test).sum())


71739
18028


In [93]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [98]:
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train, columns=[f'Feature_{i}' for i in range(X_train.shape[1])])

feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(feature_importance.head(10))

       Feature  Importance
43  Feature_43    0.073365
6    Feature_6    0.070799
5    Feature_5    0.070771
9    Feature_9    0.070501
7    Feature_7    0.069115
8    Feature_8    0.068934
15  Feature_15    0.067049
13  Feature_13    0.065792
14  Feature_14    0.053132
17  Feature_17    0.045333


In [17]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.4f}')

Random Forest Accuracy: 0.8154


In [18]:
rf_grid_params = {
    'n_estimators': [100, 200],  
    'max_depth': [10, 20], 
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2],  
    'bootstrap': [True]  
}


grid_search = GridSearchCV(estimator=rf, param_grid=rf_grid_params, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f'Best Parameters for Random Forest (Grid Search): {grid_search.best_params_}')





Best Parameters for Random Forest (Grid Search): {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [21]:
rf_random_params = {
    'n_estimators': [100, 300, 500], 
    'max_depth': [None, 10, 30],  
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2],  
    'bootstrap': [True]  
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=rf_random_params, 
                                   n_iter=10, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)

random_search.fit(X_train, y_train)

print(f'Best Parameters for Random Forest (Random Search): {random_search.best_params_}')





Best Parameters for Random Forest (Random Search): {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}


In [36]:
##Task 2 

In [59]:
df1 = pd.read_csv("train-2.csv")

In [60]:
df1

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...,...
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [61]:
target = 'price'
X = df1.drop(columns=['id', target])
y = df1[target]

In [62]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

In [63]:
te = TargetEncoder()
X[categorical_features] = te.fit_transform(X[categorical_features], y)

In [64]:
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

In [65]:
X['feature_sum'] = X.sum(axis=1)
X['feature_product'] = np.clip(X.prod(axis=1), -1e9, 1e9)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [67]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(eval_metric='rmse', random_state=42)
xgb_model.fit(X_train, y_train)

In [69]:
y_pred = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Baseline XGBoost RMSE: {rmse:.4f}')

Baseline XGBoost RMSE: 588.7087


In [78]:
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': xgb_model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance.head(10))  


       Feature  Importance
7            y    0.684161
0        carat    0.233063
3      clarity    0.029507
9  feature_sum    0.021316
2        color    0.010156
6            x    0.009890
8            z    0.005953
1          cut    0.003968
5        table    0.000731
4        depth    0.000684


In [72]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 300, 500],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

grid_search = GridSearchCV(
    XGBRegressor(), param_grid, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1
)

grid_search.fit(X_train, y_train)
print(f'Best Grid Search Params: {grid_search.best_params_}')


Best Grid Search Params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500}


In [73]:
param_dist = {
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'max_depth': np.arange(3, 10, 1),
    'n_estimators': np.arange(100, 1000, 100),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10)
}

random_search = RandomizedSearchCV(
    XGBRegressor(), 
    param_distributions=param_dist, 
    n_iter=20, 
    scoring='neg_root_mean_squared_error', 
    cv=3, 
    random_state=42, 
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print(f'Best Random Search Params: {random_search.best_params_}')




Best Random Search Params: {'subsample': 0.9444444444444444, 'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.8333333333333333}


In [74]:
best_params = grid_search.best_params_

final_model = XGBRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Final XGBoost RMSE: {rmse:.4f}')


Final XGBoost RMSE: 579.5349


In [75]:
nrmse = rmse / (y_test.max() - y_test.min())
print(f'Final XGBoost NRMSE: {nrmse:.4f}')


Final XGBoost NRMSE: 0.0313


In [None]:
#I converted the RMSE to NRMSE at the end because my target variables very relatively very high, 
#which could be attributed to why the rmse was also so high. A low NMRSE of 0.0313 is a good indication of 
#my model being able to perform well on the dataset. 