# <font color="blue">01. Import Necessary Libraries</font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# notebook, so deprecation or data-shape warnings raised by later cells will
# not be shown — consider narrowing the filter.
warnings.filterwarnings('ignore')

# <font color="blue">02. Import Data</font>

In [2]:
# Load both CSV files; the paths are relative to the notebook's working directory.
cus_data_tr = pd.read_csv('train_wn75k28.csv') # original training data
cus_data_ts = pd.read_csv('test_Wf7sxXF.csv')  # original testing data

In [3]:
# Keep an untouched copy of the training frame: later cells drop columns from
# cus_data_tr in place, so this is the only way back to the raw data without
# re-reading the file.
tr_copy = cus_data_tr.copy() # making a copy for backup

In [4]:
# Preview the first rows of the training data.
cus_data_tr.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0,0
2,3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2021-01-01,4,6,,2020-06-08,0,0,0,0,0,0,1,0,0,0,1,0,0


# <font color="blue">03. Initial Analysis</font>

In [5]:
# (rows, columns) of the training frame followed by the test frame.
(cus_data_tr.shape, cus_data_ts.shape)

((39161, 19), (13184, 18))

In [6]:
# Column dtypes as loaded; the two date columns come in as 'object' (strings).
cus_data_tr.dtypes

id                        int64
created_at               object
campaign_var_1            int64
campaign_var_2            int64
products_purchased      float64
signup_date              object
user_activity_var_1       int64
user_activity_var_2       int64
user_activity_var_3       int64
user_activity_var_4       int64
user_activity_var_5       int64
user_activity_var_6       int64
user_activity_var_7       int64
user_activity_var_8       int64
user_activity_var_9       int64
user_activity_var_10      int64
user_activity_var_11      int64
user_activity_var_12      int64
buy                       int64
dtype: object

# <font color="blue">04. Data Preprocessing</font>

In [7]:
# Remove the row identifier so it is not used as a model feature.
# In-place: re-running this cell on the same kernel raises KeyError.
cus_data_tr.drop(columns='id', inplace=True)

In [8]:
# Confirm that 'id' is gone.
cus_data_tr.head(3)

Unnamed: 0,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0,0
2,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Convert the two date columns from 'object' (string) dtype to datetime64.
# `infer_datetime_format` is deprecated since pandas 2.0, where format
# inference became the default behaviour, so the argument is not passed.
# Missing values are parsed as NaT.
for date_col in ('created_at', 'signup_date'):
    cus_data_tr[date_col] = pd.to_datetime(cus_data_tr[date_col])

In [10]:
# Inspect the last rows after date parsing (missing signup dates show as NaT).
cus_data_tr.tail(3)

Unnamed: 0,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
39158,2021-12-31,8,7,2.0,NaT,1,0,0,0,1,0,1,0,0,0,0,0,0
39159,2021-12-31,7,12,2.0,NaT,0,0,0,0,0,1,0,0,0,0,1,0,0
39160,2021-12-31,2,5,,2019-08-11,1,0,0,0,0,0,1,0,0,0,0,0,0


In [11]:
# Verify that both date columns are now datetime64[ns].
cus_data_tr.dtypes

created_at              datetime64[ns]
campaign_var_1                   int64
campaign_var_2                   int64
products_purchased             float64
signup_date             datetime64[ns]
user_activity_var_1              int64
user_activity_var_2              int64
user_activity_var_3              int64
user_activity_var_4              int64
user_activity_var_5              int64
user_activity_var_6              int64
user_activity_var_7              int64
user_activity_var_8              int64
user_activity_var_9              int64
user_activity_var_10             int64
user_activity_var_11             int64
user_activity_var_12             int64
buy                              int64
dtype: object

In [12]:
# Number of missing values per column.
cus_data_tr.isna().sum()

created_at                  0
campaign_var_1              0
campaign_var_2              0
products_purchased      20911
signup_date             15113
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
dtype: int64

In [13]:
# 'products_purchased' is missing in 20911 of the 39161 training rows, so the
# column is discarded rather than imputed.
cus_data_tr.drop(columns='products_purchased', inplace=True)

In [14]:
# Re-check missing values after dropping 'products_purchased'.
cus_data_tr.isna().sum()

created_at                  0
campaign_var_1              0
campaign_var_2              0
signup_date             15113
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
dtype: int64

In [15]:
# Discard both date columns in one call; they are not used as model features,
# and 'signup_date' still has 15113 missing values.
cus_data_tr.drop(columns=['created_at', 'signup_date'], inplace=True)

In [16]:
# Final check: no column should contain missing values at this point.
cus_data_tr.isna().sum()

campaign_var_1          0
campaign_var_2          0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
buy                     0
dtype: int64

# <font color="blue">05. Model Building</font>

In [17]:
# Feature matrix: every remaining column except the target.
X = cus_data_tr.drop(columns='buy')

In [18]:
# Target, selected with a list so it stays a single-column DataFrame
# (shape (n, 1)) rather than a Series.
y = cus_data_tr[['buy']]

In [19]:
# 80/20 hold-out split. Stratifying on the target keeps the class ratio the
# same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [20]:
# Shapes of the training features and target.
X_train.shape, y_train.shape

((31328, 14), (31328, 1))

In [21]:
# Shapes of the hold-out features and target.
X_test.shape, y_test.shape

((7833, 14), (7833, 1))

In [22]:
# Shape of the full feature matrix before splitting.
X.shape

(39161, 14)

# <font color="blue">06. Model Training || 07. Model Testing || 08. Model Evaluation</font>

In [23]:
# Fit a logistic-regression baseline, then report accuracy, confusion matrix
# and per-class metrics on both the training and the hold-out split.
model_1 = LogisticRegression(C=1)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target and
# raises a DataConversionWarning for a column vector, so flatten it explicitly.
# The fitted model is the same either way.
model_1.fit(X_train, y_train.values.ravel())
y_pred_train = model_1.predict(X_train)
y_pred_test = model_1.predict(X_test)

# NOTE(review): the recorded output shows recall of about 0.50 for class 1, so
# the high accuracy is driven by the majority class — read the per-class rows
# of the classification report, not accuracy alone.
print('Training accuracy is', accuracy_score(y_train, y_pred_train))
print('-------------------------------------------------------')
print('Confusion Matrix for training data is \n', confusion_matrix(y_train, y_pred_train))
print('-------------------------------------------------------')
print('Classification report for training data \n', (classification_report(y_train, y_pred_train)))
print('=======================================================')
print('Testing accuracy for is',accuracy_score(y_test, y_pred_test))
print('-------------------------------------------------------')
print('Confusion Matrix for testing data is \n', confusion_matrix(y_test, y_pred_test))
print('-------------------------------------------------------')
print('Classification report for testing data \n', (classification_report(y_test, y_pred_test)))

Training accuracy is 0.9731550051072523
-------------------------------------------------------
Confusion Matrix for training data is 
 [[29693    37]
 [  804   794]]
-------------------------------------------------------
Classification report for training data 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     29730
           1       0.96      0.50      0.65      1598

    accuracy                           0.97     31328
   macro avg       0.96      0.75      0.82     31328
weighted avg       0.97      0.97      0.97     31328

Testing accuracy for is 0.9737010085535555
-------------------------------------------------------
Confusion Matrix for testing data is 
 [[7428    5]
 [ 201  199]]
-------------------------------------------------------
Classification report for testing data 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      7433
           1       0.98      0.50    

# <font color="blue">09. Exporting .csv file</font>

In [24]:
# Work on a copy of the test frame so the original keeps all of its columns
# (including 'id') for the export step.
cus_ts_co = cus_data_ts.copy()

In [25]:
# Feature columns the model was trained on.
X_test.columns

Index(['campaign_var_1', 'campaign_var_2', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12'],
      dtype='object')

In [26]:
# Columns currently present in the copy of the test data.
cus_ts_co.columns

Index(['id', 'created_at', 'campaign_var_1', 'campaign_var_2',
       'products_purchased', 'signup_date', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12'],
      dtype='object')

In [27]:
# Remove the same non-feature columns that were dropped from the training
# frame, then display what is left.
cus_ts_co.drop(
    columns=['id', 'created_at', 'products_purchased', 'signup_date'],
    inplace=True,
)
cus_ts_co.columns

Index(['campaign_var_1', 'campaign_var_2', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12'],
      dtype='object')

In [28]:
# Element-wise check that the test features match the training features in
# both name and order (every entry should be True).
X_test.columns == cus_ts_co.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [29]:
# Predict the 'buy' label for every row of the unlabelled test set.
y_for_final_test = model_1.predict(cus_ts_co)

In [30]:
# Quick look at the predicted labels.
y_for_final_test

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
# Attach the predictions to the original test frame. Assignment is positional,
# and cus_ts_co was copied from cus_data_ts without reordering rows.
cus_data_ts['buy'] = y_for_final_test

In [32]:
# Display the test frame with the new 'buy' column.
cus_data_ts

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,39162,2022-01-01,2,2,,2021-08-17,1,1,0,0,0,1,0,0,0,0,1,0,1
1,39163,2022-01-01,4,7,3.0,2020-05-21,1,0,0,0,0,0,0,1,0,0,0,0,0
2,39164,2022-01-01,8,7,,,0,0,0,0,1,1,0,0,0,0,0,0,0
3,39165,2022-01-01,9,8,2.0,2020-06-22,0,0,0,0,1,1,1,0,0,0,2,0,1
4,39166,2022-01-01,4,5,2.0,2021-03-10,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13179,52341,2022-03-30,16,12,,,0,0,0,0,0,1,1,0,0,0,0,0,0
13180,52342,2022-03-30,12,8,,2022-03-19,1,0,0,0,0,1,0,0,0,0,0,0,0
13181,52343,2022-03-30,14,14,3.0,2021-08-15,0,0,1,0,0,0,1,0,0,0,0,0,0
13182,52344,2022-03-30,14,10,,,0,0,0,0,0,0,1,0,0,0,1,0,0


In [33]:
# Write the test data with predictions to disk. index=False stops the default
# RangeIndex from being written as an extra unnamed first column; rows are
# already identified by 'id'.
cus_data_ts.to_csv('final_test_buy.csv', index=False)

# <font color="blue">========================= END ============================</font>