# Imports

In [2]:
import pandas as pd
import os
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# Preprocessing

In [4]:
data = pd.read_csv('train.csv')

test = pd.read_csv('test.csv')

data = data[data['original'] == 1]
data = data.reset_index(drop=True)

test = test[test['original'] == 1]
test = test.reset_index(drop=True)

labels = data['timestamp(day)']
data = data.drop('timestamp(day)', axis=1)

test_labels = test['timestamp(day)']
test_data = test.drop('timestamp(day)', axis=1)


# Imputation(by mean)
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(data)
data = pd.DataFrame(imputed_data, columns=data.columns)

imputer = SimpleImputer(strategy='mean')
imputed_test_data = imputer.fit_transform(test_data)
test_data = pd.DataFrame(imputed_test_data, columns=test_data.columns)

#SMOTE
smote = SMOTE()
balanced_data, balanced_labels = smote.fit_resample(data, labels)

data = pd.DataFrame(balanced_data, columns=data.columns)
labels = balanced_labels

smote = SMOTE()
balanced_test_data, balanced_test_labels = smote.fit_resample(test_data, test_labels)

test_data = pd.DataFrame(balanced_test_data, columns=test_data.columns)
test_labels = balanced_test_labels


# EDA


In [5]:
data_scaled = preprocessing.scale(data)
pca = PCA()
pca.fit(data_scaled)

explained_variance_ratio = pca.explained_variance_ratio_

n_components = 32
data_transformed = pca.transform(data_scaled)[:, :n_components]
pca_data = pd.DataFrame(data_transformed)
data = pca_data

test_data_scaled = preprocessing.scale(test_data)
pca = PCA()
pca.fit(test_data_scaled)

explained_variance_ratio = pca.explained_variance_ratio_

n_components = 32
test_data_transformed = pca.transform(test_data_scaled)[:, :n_components]
pca_test_data = pd.DataFrame(test_data_transformed)
test_data = pca_test_data

# Model Selection



In [6]:
model = RandomForestClassifier()

# Model Tuning



In [7]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, scoring='f1_weighted', cv=2)
grid_search.fit(data, labels)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_model = RandomForestClassifier(**best_params)
best_model.fit(data, labels)

# Model Evaluation / Metrics

In [8]:
pred = best_model.predict(test_data)

accuracy = accuracy_score(test_labels, pred)
f1 = f1_score(test_labels, pred, average='weighted')
precision = precision_score(test_labels, pred, average='weighted')
recall = recall_score(test_labels, pred, average='weighted')

print('Accuracy:', accuracy)
print('F1-Score:', f1)
print('Precision:', precision)
print('Recall:', recall)

Accuracy: 0.13643420882390994
F1-Score: 0.10615938667400861
Precision: 0.10495356889129541
Recall: 0.13643420882390994
