In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import itertools
import scipy
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Supress scikit-learn's deprecations warnings.

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
greeks = pd.read_csv("greeks.csv")
test_data = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
train.columns = train.columns.str.replace(' ', '')
test_data.columns = test_data.columns.str.replace(' ', '')
greeks.columns = greeks.columns.str.replace(' ', '')

In [4]:
data = pd.merge(train, greeks[['Id','Beta','Gamma','Delta']], on='Id', how='left')

In [5]:
#Beta_dummies = pd.get_dummies(data['Beta'],prefix='Beta')
#Gamma_dummies = pd.get_dummies(data['Gamma'],prefix='Gamma')
#Delta_dummies = pd.get_dummies(data['Delta'],prefix='Delta')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 617 entries, 0 to 616
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      557 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      615 non-null    float64
 17  CC      614 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [7]:
# Visualize the data in line chart
feature_names = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BN', 'BQ', 'BR', 'BZ', 'CB', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EP', 'EU', 'FE', 'FI', 'FR', 'GB', 'GE', 'GF', 'GH', 'GI']
# feature_names = ['AB']

def lineChart(feature_name):
    feature_data = data[feature_name].to_numpy()
    plt.plot(feature_data)
    plt.show()

# remove the outliers using IQR logic
def removeOutliers(feature_data):
    # print(feature_data)
    mean = np.mean(feature_data)
    print(mean)
    q1, q3 = np.percentile(feature_data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    outliers = [x for x in feature_data if x < lower_bound or x > upper_bound]
    feature_data_replaced = [mean if x < lower_bound or x > upper_bound else x for x in feature_data]
    print("The count of outliers are:", len(outliers))
    # print("The data with replaced outliers is:", feature_data_replaced)
    return feature_data_replaced

for feature_name in feature_names:
    # lineChart(feature_name)
    feature_data = data[feature_name]
    feature_data_new = removeOutliers(feature_data)
    # print(len(feature_data), len(feature_data_new))
    data[feature_name] = feature_data_new

0.47714935818476445
The count of outliers are: 35
3502.0132207455463
The count of outliers are: 19
118.62451267179958
The count of outliers are: 70
38.96855212965967
The count of outliers are: 54
10.128242051863856
The count of outliers are: 133
5.545576059967586
The count of outliers are: 27
0.06031962398703415
The count of outliers are: 107
10.566446961102116
The count of outliers are: 14
8.053011588330667
The count of outliers are: 45
21.419492382495925
The count of outliers are: 14
98.32873688509875
The count of outliers are: 0
1218.133237713126
The count of outliers are: 47
550.6325253525081
The count of outliers are: 116
77.10415084878039
The count of outliers are: 0
11.24106389303078
The count of outliers are: 38
0.03061471474878444
The count of outliers are: 30
1.4037610332252755
The count of outliers are: 74
0.7422616288492706
The count of outliers are: 31
36.91758957131283
The count of outliers are: 31
1.3837923598055106
The count of outliers are: 21
51.12832583468397
The cou

In [8]:
#identify columns with NAs and the number of NAs
data.isnull().sum()

Id       0
AB       0
AF       0
AH       0
AM       0
        ..
GL       1
Class    0
Beta     3
Gamma    3
Delta    3
Length: 61, dtype: int64

In [9]:
# Select the features and target
features = [column for column in data.columns if column not in ['Id', 'Class']]
target = 'Class'

In [10]:
# Prepare the data - encode, remove inf, -inf and fill NaNs
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].astype('category').cat.codes

In [11]:
for column in features:
    data[column] = data[column].replace([np.inf, -np.inf], np.nan)
    data[column] = data[column].fillna(data[column].median())

In [12]:
# Use Random Forest to get feature importances
rf = RandomForestClassifier(random_state=42)
rf.fit(data[features], data[target])

RandomForestClassifier(random_state=42)

In [13]:
# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)

In [14]:
# Select top features
top_features = feature_importances['Feature'][:10]
top_features

57    Gamma
32       DU
21       CR
47       FL
25       DA
55       GL
44       FD
26       DE
12       BQ
36       EE
Name: Feature, dtype: object

In [15]:
# Select the features and target
features = [column for column in data.columns if column not in ['Id', 'Class','Gamma']]
target = 'Gamma'

for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].astype('category').cat.codes

for column in features:
    data[column] = data[column].replace([np.inf, -np.inf], np.nan)
    data[column] = data[column].fillna(data[column].median())

rf = RandomForestClassifier(random_state=42)
rf.fit(data[features], data[target])

# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)

# Select top features
Gamma_top_features = feature_importances['Feature'][:10]
Gamma_top_features

RandomForestClassifier(random_state=42)

32    DU
47    FL
55    GL
23    CU
21    CR
10    BN
44    FD
38    EH
12    BQ
25    DA
Name: Feature, dtype: object

In [16]:
features = pd.concat([data[top_features[1:]],data[Gamma_top_features]],axis=1)
features = features.T.drop_duplicates().T
features

Unnamed: 0,DU,CR,FL,DA,GL,FD,DE,BQ,EE,CU,BN,EH
0,5.310690,0.742262,7.298162,69.083400,0.120343,10.265073,295.570575,152.707705,1.987283,1.302012,22.5984,0.305107
1,0.005518,1.117800,0.173229,70.798360,21.978000,0.296850,178.553100,14.754720,0.858603,1.357182,19.4205,0.003042
2,1.289739,0.700350,7.709560,70.819700,0.196941,8.745201,321.426625,219.320160,3.064778,1.009611,26.4825,0.377208
3,2.655345,0.636075,6.122162,47.275860,0.155829,7.884336,196.607985,11.050410,3.813326,0.722727,23.6577,0.305107
4,1.144902,0.693150,8.153058,74.065320,0.096614,4.274640,200.178160,149.717165,3.490846,0.827550,24.0108,0.164268
...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.005518,0.698250,0.173229,21.759040,21.978000,0.296850,355.930925,27.287375,1.354416,1.070298,21.1860,0.003042
613,0.648318,0.761025,10.223150,43.909960,0.145340,6.067614,157.393715,344.644105,0.753797,2.146113,27.1887,0.139932
614,0.005518,0.879825,0.173229,51.128326,21.978000,0.296850,223.209115,103.988995,2.225112,1.489590,20.4798,0.003042
615,0.510378,0.583125,9.256996,51.041400,0.184622,6.192291,112.196630,61.642115,1.628524,1.428903,19.0674,0.139932


In [17]:
# Train/Test Split
X = features
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
# Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 10, 15],
    'n_estimators': [50,100,200,300],
    'min_samples_leaf': [1,2,3,4,5,6,7],
}
gridSearch = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(X_train_scaled, y_train)
print('Initial score: ', gridSearch.best_score_)
print('Initial parameters: ', gridSearch.best_params_)

bestRFC = gridSearch.best_estimator_

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 10, 15],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7],
                         'n_estimators': [50, 100, 200, 300]})

Initial score:  0.8978882651697406
Initial parameters:  {'max_depth': 10, 'min_samples_leaf': 3, 'n_estimators': 100}


In [20]:
# Model Definition with RandomForest and Cross-validation
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=3, random_state=42)
scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring='neg_log_loss')
log_loss_scores = -scores
print(f'Log Loss scores for each fold: {log_loss_scores}')
average_log_loss = np.mean(log_loss_scores)
print(f'Average Log Loss: {average_log_loss}')

Log Loss scores for each fold: [0.23715239 0.21253797 0.31581365 0.25946048 0.26699338]
Average Log Loss: 0.2583915748113523


In [21]:
# Fit the model with the training data
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=42)

In [22]:
# Prepare the test data
test_data_clean = test_data[features.columns]
test_data_clean = test_data_clean.replace([np.inf, -np.inf], np.nan)
test_data_clean.fillna(test_data_clean.median(numeric_only=True), inplace=True)
test_data_scaled = scaler.transform(test_data_clean)

In [23]:
# Predictions using the fitted model
test_preds = rf_model.predict_proba(test_data_scaled)

In [24]:
# Create a submission file
submission = pd.DataFrame(test_data['Id'], columns=['Id'])
submission[['class_0', 'class_1']] = test_preds
submission.to_csv('submission.csv', index=False)