In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import holidays
from sklearn.utils import resample


In [None]:
df = pd.read_csv("/content/shope1.csv")

In [None]:
turkey_holidays = holidays.Turkey()
df['invoice_date'] = pd.to_datetime(df['invoice_date'], format='mixed')
df['Holiday_Period'] = df['invoice_date'].apply(lambda x: 1 if x in turkey_holidays else 0)

In [None]:
label_enc = LabelEncoder()
df['shopping_mall'] = label_enc.fit_transform(df['shopping_mall'])


In [None]:
categorical_cols = ['gender', 'payment_method', 'product_name']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
max_count = df['shopping_mall'].value_counts().max()
resampled_dfs = []
for mall in df['shopping_mall'].unique():
    mall_df = df[df['shopping_mall'] == mall]
    resampled_mall_df = resample(mall_df, replace=True, n_samples=max_count, random_state=42)
    resampled_dfs.append(resampled_mall_df)
balanced_df = pd.concat(resampled_dfs)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df = balanced_df
df.head()

Unnamed: 0,invoice_no,customer_id,age,category,quantity,price,invoice_date,shopping_mall,review_rating,Holiday_Period,...,product_name_Sandals,product_name_Science Book,product_name_Smart Speaker,product_name_Sneakers,product_name_Snow Globe,product_name_Sweater,product_name_T-Shirt,product_name_T-shirt,product_name_Tablet,product_name_VR Headset
0,I204523,C805594,54,Toys,3,107.52,2021-05-10,4,4.225474,0,...,False,False,False,False,False,False,False,False,False,False
1,I114827,C869172,66,Shoes,4,2400.68,2021-03-14,8,3.262274,0,...,False,False,False,False,False,False,False,False,False,False
2,I249523,C123624,25,Clothing,5,1500.4,2021-09-29,3,2.936209,0,...,False,False,False,False,False,False,False,True,False,False
3,I323594,C962118,61,Food & Beverage,2,10.46,2021-04-28,7,3.391872,0,...,False,False,False,False,False,False,False,False,False,False
4,I284260,C237616,32,Food & Beverage,1,5.23,2022-02-03,0,2.947601,0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

scaler = StandardScaler()
df['age_scaled'] = scaler.fit_transform(df[['age']])

In [None]:
X = df.drop(columns=['shopping_mall', 'invoice_no', 'customer_id', 'invoice_date', 'category'])  # Drop target and non-numeric columns from features
y = df['shopping_mall']  # Encoded target

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

models = {'Random Forest': RandomForestClassifier(n_estimators=200, random_state=1)}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of {name}: {accuracy:.2f}')
    print(classification_report(y_test, y_pred))

Accuracy of Random Forest: 0.84
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3988
           1       0.90      0.97      0.93      3988
           2       0.89      0.96      0.92      3989
           3       0.81      0.82      0.82      3989
           4       0.75      0.63      0.68      3989
           5       0.75      0.62      0.68      3988
           6       0.78      0.69      0.73      3989
           7       0.82      0.83      0.83      3989
           8       0.89      0.97      0.93      3988
           9       0.88      0.96      0.92      3989

    accuracy                           0.84     39886
   macro avg       0.83      0.84      0.84     39886
weighted avg       0.83      0.84      0.84     39886

