In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the historic records that every experiment below trains and tests on.
HISTORIC_CSV = "/content/drive/MyDrive/college/dsw_ml_test/data/historic.csv"
his = pd.read_csv(HISTORIC_CSV)

In [4]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [5]:
# Columns fed to the model, grouped by the preprocessing each group receives.
numeric_features = ['stars']
categorical_features = ['category', 'main_promotion', 'color']

# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding.
# handle_unknown='ignore' keeps categories unseen during fit from raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end model: preprocessing, then a seeded 100-tree random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Separate the target column from the predictors, then hold out 20% of the
# rows for testing with a fixed seed.
y = his['success_indicator']
X = his.drop(columns='success_indicator')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split and predict the held-out rows in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy, then the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.81875
              precision    recall  f1-score   support

        flop       0.78      0.69      0.73       571
         top       0.84      0.89      0.86      1029

    accuracy                           0.82      1600
   macro avg       0.81      0.79      0.80      1600
weighted avg       0.82      0.82      0.82      1600



In [6]:
from sklearn.ensemble import IsolationForest


### Repeating the workflow after removing the main_promotion column

In [31]:
# Categorical predictors kept for this experiment.
categorical_cols = [
    "category",
    "color",
]

In [30]:
# Keep only the predictors used in this experiment plus the target column.
df = his[[
    'category',
    'color',
    'stars',
    'success_indicator',
]]

In [32]:
# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# categories unseen during fit from raising at transform time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns are standardised.
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

In [34]:
# Apply the categorical transformer to `categorical_cols` and the numeric
# transformer to the 'stars' column.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, ['stars']),
])

In [35]:
# Seed the forest so the reported accuracy is reproducible after a kernel
# restart; an unseeded forest gives a slightly different score on every run.
# The seed matches the one used for the train/test split.
clf = RandomForestClassifier(random_state=42)

# Preprocessing followed by the random-forest classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', clf)])

In [36]:
# The target is 'success_indicator'; every other column of `df` is a predictor.
y = df['success_indicator']
X = df.drop(columns='success_indicator')

In [37]:
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Fit the preprocessing + classifier pipeline on the training split.
pipeline.fit(X_train, y_train)

In [39]:
# Score the fitted pipeline on the held-out rows.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.825625


In [40]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [44]:
# Candidate models compared on the same preprocessing and train/test split.
# The two boosting ensembles are stochastic, so they are seeded to keep the
# reported scores reproducible across kernel restarts.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Logistic Regression': LogisticRegression()
}


In [45]:
# Build one pipeline per candidate model.
# - Each pipeline gets its own clone of the preprocessor, so fitting one
#   pipeline never mutates transformer state held by another.
# - A comprehension is used so the loop variables do not overwrite the
#   notebook-level `clf` and `pipeline` objects created for the random forest.
pipelines = {
    clf_name: Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('classifier', estimator)])
    for clf_name, estimator in classifiers.items()
}


In [46]:
# Train every candidate pipeline on the training split and print its accuracy
# and per-class report on the held-out split.
for clf_name in pipelines:
    pipeline = pipelines[clf_name]
    print(f"Training and evaluating {clf_name}...")
    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Accuracy: {accuracy}')
    print(classification_report(y_test, y_pred))
    print("----------------------------------------")


Training and evaluating SVM...
Accuracy: 0.840625
              precision    recall  f1-score   support

        flop       0.82      0.71      0.76       571
         top       0.85      0.91      0.88      1029

    accuracy                           0.84      1600
   macro avg       0.83      0.81      0.82      1600
weighted avg       0.84      0.84      0.84      1600

----------------------------------------
Training and evaluating KNN...
Accuracy: 0.82375
              precision    recall  f1-score   support

        flop       0.79      0.68      0.73       571
         top       0.84      0.90      0.87      1029

    accuracy                           0.82      1600
   macro avg       0.82      0.79      0.80      1600
weighted avg       0.82      0.82      0.82      1600

----------------------------------------
Training and evaluating Gradient Boosting...
Accuracy: 0.81875
              precision    recall  f1-score   support

        flop       0.79      0.67      0.73    

1. SVM (Support Vector Machine):

SVM achieved an accuracy of 84.06%.
Precision was 0.82 for 'flop' and 0.85 for 'top'; recall was 0.71 and 0.91, and F1-score was 0.76 and 0.88, so the 'top' class was identified more reliably than the 'flop' class.

2. KNN (K-Nearest Neighbors):

KNN achieved an accuracy of 82.38%.
It showed slightly lower precision, recall, and F1-scores compared to SVM, particularly for the 'flop' class.

3. Gradient Boosting:

Gradient Boosting achieved an accuracy of 81.88%.
While it showed a good recall for the 'top' class, it exhibited lower performance for the 'flop' class compared to SVM and KNN.

4. AdaBoost:

AdaBoost achieved an accuracy of 81.06%.
It showed balanced performance with similar precision, recall, and F1-scores for both classes, though slightly lower compared to SVM and KNN.

5. Logistic Regression:

Logistic Regression achieved an accuracy of 81.63%.
Similar to AdaBoost, it demonstrated balanced performance but with slightly lower precision and recall for the 'flop' class.

Overall, SVM performed the best among the classifiers evaluated, with the highest accuracy and the strongest precision, recall, and F1-scores among the reports shown above. KNN was competitive, although slightly behind SVM. Gradient Boosting, AdaBoost, and Logistic Regression had slightly lower accuracy and performance than SVM and KNN, with gaps of varying size between the 'flop' and 'top' classes in precision, recall, and F1-score.