# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report
from mlxtend.plotting import plot_confusion_matrix
from scikitplot.plotters import plot_roc_curve, plot_precision_recall_curve, plot_feature_importances
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Load the M1 dataset from CSV and preview the first rows.
# NOTE(review): the path is absolute and machine-specific — confirm before running elsewhere.
df = pd.read_csv("/mnt/hdd/Datasets/M1_data.csv", delimiter=",")
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicated rows in place; the index is not reset, so surviving rows keep their original labels.
df.drop_duplicates(inplace=True)

# EDA

In [None]:
# Preview the de-duplicated frame.
df.head()

In [None]:
# Pie chart of the share of each `trust_apple` value.
# NOTE(review): `explode` has two entries, so this assumes the column holds exactly two distinct values — confirm.
df.trust_apple.value_counts().plot(kind="pie", autopct="%.2f%%", startangle=90, shadow=True, explode=[0, 0.1], title="Trust Apple")

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="interest_computers")
ax.bar_label(ax.containers[0])
plt.title("Interest Computers")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="age_computer")
ax.bar_label(ax.containers[0])
plt.title("Age Computer")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="user_pcmac")
ax.bar_label(ax.containers[0])
plt.title("PC vs Mac")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="appleproducts_count")
ax.bar_label(ax.containers[0])
plt.title("Apple Products Count")
plt.show()

In [None]:
df.familiarity_m1.value_counts().plot(kind="pie", autopct="%.2f%%", shadow=True, startangle=90, explode=[0, 0.1], title="Familiarity M1")

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_batterylife")
ax.bar_label(ax.containers[0])
plt.title("Battery Life")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_price")
ax.bar_label(ax.containers[0])
plt.title("Price")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_size")
ax.bar_label(ax.containers[0])
plt.title("Size")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_multitasking")
ax.bar_label(ax.containers[0])
plt.title("Multitasking")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_noise")
ax.bar_label(ax.containers[0])
plt.title("Noise")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_performance")
ax.bar_label(ax.containers[0])
plt.title("Performance")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_neural")
ax.bar_label(ax.containers[0])
plt.title("Neural")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_synergy")
ax.bar_label(ax.containers[0])
plt.title("Synergy")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="f_performanceloss")
ax.bar_label(ax.containers[0])
plt.title("Performance Loss")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="m1_consideration")
ax.bar_label(ax.containers[0])
plt.title("m1_consideration")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="m1_purchase")
ax.bar_label(ax.containers[0])
plt.title("m1_purchase")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="gender")
ax.bar_label(ax.containers[0])
plt.title("Gender")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="age_group")
ax.bar_label(ax.containers[0])
plt.title("Age Group")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="income_group")
ax.bar_label(ax.containers[0])
plt.title("Income Group")
plt.show()

In [None]:
plt.figure()
ax = sns.countplot(data=df, x="status")
ax.bar_label(ax.containers[0])
plt.title("Status")
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize=(14, 5))
ax = sns.countplot(data=df, x="domain")
ax.bar_label(ax.containers[0])
plt.title("domain")
plt.xticks(rotation=90)
plt.show()

# Preprocess

In [None]:
df["interest_computers"] = df["interest_computers"].map({5: "Good", 4: "Good", 3: "Bad", 2: "Bad"})

In [None]:
df["age_computer"] = df["age_computer"].map({
    0: "New", 1: "New", 2: "New", 
    3: "Old", 4: "Old", 5: "Old", 
    6: "Very Old", 7: "Very Old", 8: "Very Old", 9: "Very Old"
})

In [None]:
df["user_pcmac"] = df["user_pcmac"].map({
    "Apple": "Apple",
    "PC": "Other", "Hp": "Other", "Other": "Other"
})

In [None]:
df["f_batterylife"] = df["f_batterylife"].map({
    1: "Low", 2: "Low",
    3: "Normal", 4: "Normal",
    5: "High"
})

In [None]:
df["f_price"] = df["f_price"].map({
    1: "Low", 2: "Low",
    3: "Normal",
    4: "Normal-High",
    5: "High"
})

In [None]:
df["f_multitasking"] = df["f_multitasking"].map({
    2: "Low", 3: "Low",
    4: "Normal",
    5: "High"
})

In [None]:
df["f_noise"] = df["f_noise"].map({
    1: "Low", 2: "Low",
    3: "Low",
    4: "Normal",
    5: "High"
})

In [None]:
df["f_performance"] = df["f_performance"].map({
    2: "Low", 3: "Low",
    4: "Normal",
    5: "High"
})

In [None]:
# Preview the frame after the binning steps.
df.head()

In [None]:
# Re-check dtypes and non-null counts after binning.
df.info()

In [None]:
# List all column names.
df.columns

In [None]:
# Distinct values of the `status` column.
df["status"].unique()

In [None]:
# Collect every object-dtype column, then drop the target so it is not encoded as a feature.
# list.remove raises ValueError if "m1_purchase" is not in the list, i.e. the target must be object dtype here.
cat_cols = [col for col in df.columns if df[col].dtype == "object"]
cat_cols.remove("m1_purchase")
cat_cols

In [None]:
# Show the row labelled 0 as a Series (one line per column).
df.loc[0].T

In [None]:
# Replace every categorical feature column with the numeric codes learned by OrdinalEncoder.
# NOTE(review): with default settings the encoder orders categories by sorted value, so labels such as
# "High"/"Low"/"Normal" are coded alphabetically rather than by magnitude — confirm this is acceptable.
ord_encoder =  OrdinalEncoder()
df[cat_cols] = ord_encoder.fit_transform(df[cat_cols])

In [None]:
# Persist the fitted encoder (presumably for reuse by the serving code under ../base/server).
# The context manager guarantees the file is flushed and closed even if pickling fails.
with open("../base/server/model/ord_encoder.pkl", "wb") as encoder_file:
    pickle.dump(ord_encoder, encoder_file)

In [None]:
# Preview the frame after ordinal encoding.
df.head()

In [None]:
# Display labels used by the classification reports and confusion-matrix plots.
# NOTE(review): assumed to match the sorted values of `m1_purchase` — verify against the data.
class_names = ["No", "Yes"]

# Model

In [None]:
X = df.drop("m1_purchase", axis=1)
y = df["m1_purchase"]

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X_resampled)

In [None]:
pickle.dump(ss, open("../base/server/model/ss.pkl", "wb"))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Random Forest

In [None]:
# Fit a forest with library-default hyperparameters and report how long the fit took (wall clock, seconds).
rf = RandomForestClassifier()
start = time.time()
rf.fit(X_train, y_train)
rf_time = time.time() - start
print("Random Forest Train Time:", rf_time)

In [None]:
# Persist the fitted baseline forest; the context manager closes the file handle deterministically.
with open("../base/server/model/rf.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

In [None]:
# 3-fold cross-validation of a fresh default forest on the training split; report the mean fold score.
baseline_forest = RandomForestClassifier()
rf_scores = cross_val_score(baseline_forest, X_train, y_train, cv=3)
rf_mean_cv_score = rf_scores.mean()
print("Random Forest Cross-Validation Score:", rf_mean_cv_score)

In [None]:
# Baseline forest: hard predictions on both splits plus class probabilities for the test split.
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)
rf_test_proba = rf.predict_proba(X_test)

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric so the value is the same,
# but the documented order keeps this call consistent with the other metric calls in the notebook.
rf_train_score = accuracy_score(y_train, rf_pred_train)
rf_test_score = accuracy_score(y_test, rf_pred_test)
print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

# Hyperparameter Tuning

In [None]:
# Exhaustive search over tree depth, features considered per split and forest size,
# scored with 10-fold cross-validation on the training split using all available cores.
rf_params = {
    "max_depth": np.arange(1, 10),
    "max_features": [5, 10, 15, 20],
    "n_estimators": [100, 200, 300, 400, 500, 1000],
}
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,
)
rf_cv_model = rf_grid_search.fit(X_train, y_train)
rf_cv_model.best_params_

In [None]:
rf_tuned = RandomForestClassifier(max_depth=9, max_features=5, n_estimators=300)
start = time.time()
rf_tuned.fit(X_train, y_train)
end = time.time()
rf_tuned_time = end - start
print("Tuned RandomForestClassifier Train Time:", rf_tuned_time)

In [None]:
rf_tuned_scores = cross_val_score(RandomForestClassifier(max_depth=9, max_features=5, n_estimators=300), X_train, y_train, cv=3)
print("Tuned RandomForestClassifier Cross-Validation Scores:", rf_tuned_scores)

In [None]:
# Tuned forest: hard predictions on both splits plus class probabilities for the test split.
tuned_rf_pred_train = rf_tuned.predict(X_train)
tuned_rf_pred_test = rf_tuned.predict(X_test)
tuned_rf_test_proba = rf_tuned.predict_proba(X_test)

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric so the value is the same,
# but the documented order keeps this call consistent with the other metric calls in the notebook.
tuned_rf_train_score = accuracy_score(y_train, tuned_rf_pred_train)
tuned_rf_test_score = accuracy_score(y_test, tuned_rf_pred_test)
print("Tuned Random Forest Train Score:", tuned_rf_train_score)
print("tuned Random Forest Test Score:", tuned_rf_test_score)

In [None]:
# Per-class precision / recall / F1 of the tuned forest on the test split.
# NOTE(review): `target_names` are display labels applied in sorted-label order — confirm `class_names` matches.
print(classification_report(y_test, tuned_rf_pred_test, target_names=class_names))

In [None]:
# Confusion matrix of the tuned forest on the test split, drawn with both absolute counts and normalised values.
rf_cm = confusion_matrix(y_test, tuned_rf_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=rf_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Tuned Random Forest Confusion Matrix")
plt.show()

In [None]:
# Bar chart of the tuned forest's feature importances, labelled with the column names of X.
plot_feature_importances(rf_tuned, feature_names=X.columns, x_tick_rotation=90, title="Tuned Random Forest Feature Importances")
plt.show()

In [None]:
# Pair each feature name with the importance reported by the tuned forest.
rf_tuned_feature_df = pd.DataFrame(zip(X.columns, rf_tuned.feature_importances_), columns=["feature", "feature_importance"])
rf_tuned_feature_df

In [None]:
# Keep only features whose importance exceeds 0.04, most important first.
tuned_features = rf_tuned_feature_df[rf_tuned_feature_df["feature_importance"] > 0.04].sort_values(by="feature_importance", ascending=False).reset_index(drop=True)
tuned_features

In [None]:
# Names of the retained features.
tuned_features.feature.unique()

In [None]:
# Restrict the feature matrix to the retained columns.
# NOTE(review): this starts again from X (the frame built from df), not from the SMOTE-resampled data — confirm intended.
X_tuned = X[tuned_features.feature.unique()]
X_tuned

In [None]:
# Standardise the reduced feature matrix with a new scaler.
# NOTE(review): the scaler is fitted on all rows before the split below, so test rows influence the scaling statistics.
ss_tuned = StandardScaler()
X_tuned_scaled = ss_tuned.fit_transform(X_tuned)
X_tuned_scaled

In [None]:
# Target values aligned to the rows of X_tuned by index label.
y_tuned = y[X_tuned.index]
y_tuned

In [None]:
# New 80/20 split on the reduced features. This overwrites X_train / X_test / y_train / y_test,
# so predictions made by earlier models no longer correspond to the current y_test.
X_train, X_test, y_train, y_test = train_test_split(X_tuned_scaled, y_tuned, test_size=0.2, random_state=42)

In [None]:
rf_feat = RandomForestClassifier()
start = time.time()
rf_feat.fit(X_train, y_train)
end = time.time()
rf_feat_time = end - start
print("RandomForestClassifier Train Time:", rf_feat_time)

In [None]:
rf_feat_scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=3)
print("RandomForestClassifier Cross-Validation Scores:", rf_feat_scores)

In [None]:
# Feature-subset forest: hard predictions on both splits plus class probabilities for the test split.
feat_rf_pred_train = rf_feat.predict(X_train)
feat_rf_pred_test = rf_feat.predict(X_test)
feat_rf_test_proba = rf_feat.predict_proba(X_test)

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric so the value is the same,
# but the documented order keeps this call consistent with the other metric calls in the notebook.
feat_rf_train_score = accuracy_score(y_train, feat_rf_pred_train)
feat_rf_test_score = accuracy_score(y_test, feat_rf_pred_test)
print("Random Forest Train Score:", feat_rf_train_score)
print("Random Forest Test Score:", feat_rf_test_score)

In [None]:
# Binary precision / F1 / recall need to know which label is the positive class. The target is never
# encoded, so it still holds its original string labels and the default pos_label=1 would raise ValueError.
# Take the positive label from the fitted model instead: `classes_` is sorted, so index 1 is the
# second class (e.g. "Yes" for No/Yes, or 1 for 0/1).
positive_label = rf_feat.classes_[1]

feat_rf_precision_score = precision_score(y_test, feat_rf_pred_test, pos_label=positive_label)
feat_rf_f1_score = f1_score(y_test, feat_rf_pred_test, pos_label=positive_label)
feat_rf_recall_score = recall_score(y_test, feat_rf_pred_test, pos_label=positive_label)
feat_rf_accuracy_score = accuracy_score(y_test, feat_rf_pred_test)

print("Feat Random Forest Precision Score:", feat_rf_precision_score)
print("Feat Random Forest F1 Score:", feat_rf_f1_score)
print("Feat Random Forest Recall Score:", feat_rf_recall_score)
print("Feat Random Forest Accuracy Score:", feat_rf_accuracy_score)

In [None]:
# Report on the feature-subset model's own predictions. `rf_pred_test` belongs to the baseline model
# and to the earlier train/test split, so it does not line up with the current `y_test`.
print(classification_report(y_test, feat_rf_pred_test, target_names=class_names))

In [None]:
# Confusion matrix of the feature-subset forest on the current test split.
# `rf_cm` is reused, overwriting the matrix computed earlier for the tuned forest.
rf_cm = confusion_matrix(y_test, feat_rf_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=rf_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Feat Random Forest Confusion Matrix")
plt.show()

In [None]:
# Precision-recall curve of the feature-subset forest from its test-split class probabilities.
# The helper imported from scikitplot.plotters is `plot_precision_recall_curve`;
# `plot_precision_recall` is never imported and would raise NameError.
plot_precision_recall_curve(y_test, feat_rf_test_proba)
plt.title("Feat Random Forest Precision-Recall Curve")
plt.show()

In [None]:
# ROC curve of the feature-subset forest from its test-split class probabilities.
plot_roc_curve(y_test, feat_rf_test_proba)
plt.title("Feat Random Forest ROC Curve")
plt.show()