In [7]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

In [8]:
df = pd.read_csv('coding_round_data.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [9]:
# This dataset does not have any nan values
name_list = []
nan_list = []
for col in df.columns:
    if not df[col].isnull().values.any():
        name_list.append(col)
    else:
        nan_list.append(col)   
        
new_df = df[name_list]
nan_df = df[nan_list]

In [10]:
# Pop target
target = df.pop('Revenue')

In [45]:
# separate categorical column and numerical column
# cat_cols = df.select_dtypes("object").columns
num_cols = df.select_dtypes("number").columns

num_df = df[num_cols]
cat_df = df.loc[:, ~df.columns.isin(num_df)]


In [46]:
cat_df.head()

Unnamed: 0,Month,VisitorType,Weekend
0,Feb,Returning_Visitor,False
1,Feb,Returning_Visitor,False
2,Feb,Returning_Visitor,False
3,Feb,Returning_Visitor,False
4,Feb,Returning_Visitor,True


In [47]:
#summarize categorical variables
cat_df.describe()

Unnamed: 0,Month,VisitorType,Weekend
count,12330,12330,12330
unique,10,3,2
top,May,Returning_Visitor,False
freq,3364,10551,9462


In [48]:
#print counts of each categorical variable
for col in cat_df:
    print(cat_df[col].value_counts(dropna=False),'\n')

May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: Month, dtype: int64 

Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: VisitorType, dtype: int64 

False    9462
True     2868
Name: Weekend, dtype: int64 



In [49]:
# create dummy variables
month_dummy = pd.get_dummies(cat_df['Month'])
visitor_dummy = pd.get_dummies(cat_df['VisitorType'])
weekend_dummy = pd.get_dummies(cat_df['Weekend'])
# cat_df.drop(['x3','x60','x65'],axis = 1, inplace = True)

# cat_df = pd.concat([num_df,month_dummy,visitor_dummy,weekend_dummy],axis = 1)

In [11]:
num_col = ['Administrative','Administrative_Duration','Informational','Informational_Duration','ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues','SpecialDay']
num_df = df[num_col]
cat_df = df.loc[:,~df.columns.isin(num_col)]
# cat_df = pd.concat([df.loc[:,~df.columns.isin(num_col)]],axis = 1)
# cat_df.drop(labels, axis, index, columns, level, inplace., errors)

In [12]:
cat_df

Unnamed: 0,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,Feb,1,1,1,1,Returning_Visitor,False
1,Feb,2,2,1,2,Returning_Visitor,False
2,Feb,4,1,9,3,Returning_Visitor,False
3,Feb,3,2,2,4,Returning_Visitor,False
4,Feb,3,3,1,4,Returning_Visitor,True
...,...,...,...,...,...,...,...
12325,Dec,4,6,1,1,Returning_Visitor,True
12326,Nov,3,2,1,8,Returning_Visitor,True
12327,Nov,3,2,1,13,Returning_Visitor,True
12328,Nov,2,2,3,11,Returning_Visitor,False


In [54]:
cat_df = pd.concat([num_df,month_dummy,visitor_dummy,weekend_dummy],axis = 1)

In [22]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score


# from sklearn.linear_model import RandomForestRe
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [14]:
#split x and y data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(num_df,
                                  target,
                                  test_size=0.2,
                                  random_state=1)

In [15]:
# scale numerical data
scalar1 = StandardScaler()
X_train_scaled= pd.DataFrame(scalar1.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scalar1.transform(X_test))

In [16]:
X_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.695075,-0.457171,-0.396532,-0.248563,-0.621633,-0.562207,-0.458434,0.143270,-0.321857,-0.309779
1,-0.695075,-0.457171,-0.396532,-0.248563,-0.688681,-0.615738,3.663888,3.230346,-0.321857,-0.309779
2,-0.695075,-0.457171,-0.396532,-0.248563,-0.688681,-0.615738,3.663888,3.230346,-0.321857,-0.309779
3,1.102885,0.149714,-0.396532,-0.248563,-0.465186,-0.537861,-0.343925,-0.062535,-0.321857,-0.309779
4,0.803225,-0.194332,-0.396532,-0.248563,-0.331089,-0.502758,-0.229417,-0.428411,-0.321857,-0.309779
...,...,...,...,...,...,...,...,...,...,...
9859,0.503565,1.784028,-0.396532,-0.248563,-0.576934,-0.401874,-0.458434,-0.628500,-0.321857,-0.309779
9860,0.203905,0.404829,3.520343,1.468349,0.249998,0.962344,-0.458434,-0.645650,-0.142292,-0.309779
9861,-0.395415,-0.397820,-0.396532,-0.248563,0.384095,0.283530,0.111674,0.102401,-0.321857,0.696033
9862,-0.095755,-0.245204,-0.396532,-0.248563,0.853434,2.670379,-0.114908,-0.313806,-0.321857,-0.309779


In [20]:
#Setup recursive feature reduction w/ cross validation
clf2 = RFECV(LogisticRegression(max_iter=200),
      scoring = 'roc_auc',
      n_jobs = -1,
      cv = 3,
      step = 1)

In [23]:
#Generate predicted probabilites for features without NAN
clf2.fit(X_train_scaled, y_train)

clf2_probs = clf2.predict_proba(X_test_scaled)
print('AUC: ', roc_auc_score(y_test, clf2_probs[:,1]))

AUC:  0.8951957594983599


In [26]:
reg = LogisticRegression()
reg.fit(X_train_scaled,y_train)
reg_probs = reg.predict_proba(X_test_scaled)
print('AUC: ', roc_auc_score(y_test, reg_probs[:,1]))

AUC:  0.8957480484667245


In [74]:
#split x and y data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(df,
                                  target,
                                  test_size=0.2,
                                  random_state=1)

In [75]:
num_col = ['Administrative','Administrative_Duration','Informational','Informational_Duration','ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues','SpecialDay']
num_df_train = X_train[num_col]
num_df_test = X_test[num_col]
cat_df_train = X_train.loc[:,~X_train.columns.isin(num_col)]
cat_df_test = X_test.loc[:,~X_test.columns.isin(num_col)]

In [76]:
num_df_test

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
5487,0,0.0,0,0.0,1,0.000000,0.100000,0.200000,0.000000,0.0
7736,5,1415.5,0,0.0,34,588.366667,0.000000,0.030247,8.041078,0.0
5310,0,0.0,0,0.0,2,73.000000,0.000000,0.050000,0.000000,0.0
2982,2,98.0,0,0.0,20,401.500000,0.000000,0.003333,0.000000,0.0
662,4,56.5,0,0.0,8,202.166667,0.000000,0.010000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...
6688,3,106.6,0,0.0,11,203.980000,0.000000,0.004444,0.000000,0.0
1305,0,0.0,0,0.0,5,210.000000,0.120000,0.160000,0.000000,0.0
4434,3,135.5,0,0.0,37,1238.375000,0.005405,0.030511,7.941532,0.0
9358,0,0.0,0,0.0,10,383.000000,0.000000,0.050000,0.000000,0.0


In [77]:
# create dummy variables for trainining set
month_dummy = pd.get_dummies(cat_df_train['Month'])
visitor_dummy = pd.get_dummies(cat_df_train['VisitorType'])
weekend_dummy = pd.get_dummies(cat_df_train['Weekend'])
cat_df_train.drop(['Month','VisitorType','Weekend'],axis = 1, inplace = True)

cat_df_train = pd.concat([cat_df_train,month_dummy,visitor_dummy,weekend_dummy],axis = 1)

# create dummy variables for test set
month_dummy = pd.get_dummies(cat_df_test['Month'])
visitor_dummy = pd.get_dummies(cat_df_test['VisitorType'])
weekend_dummy = pd.get_dummies(cat_df_test['Weekend'])
cat_df_test.drop(['Month','VisitorType','Weekend'],axis = 1, inplace = True)

cat_df_test = pd.concat([cat_df_test,month_dummy,visitor_dummy,weekend_dummy],axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [83]:
cat_df_test

Unnamed: 0,OperatingSystems,Browser,Region,TrafficType,Aug,Dec,Feb,Jul,June,Mar,May,Nov,Oct,Sep,New_Visitor,Other,Returning_Visitor,False,True
5487,2,2,2,5,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
7736,2,5,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0
5310,1,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
2982,2,5,1,2,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
662,3,2,1,8,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6688,2,2,1,3,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
1305,1,10,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4434,2,2,1,2,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
9358,2,2,9,2,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1


In [80]:
# scale numerical data
scalar1 = StandardScaler()
X_train_scaled= pd.DataFrame(scalar1.fit_transform(num_df_train))
X_test_scaled = pd.DataFrame(scalar1.transform(num_df_test))

In [89]:
X_train_processed = pd.concat([X_train_scaled, cat_df_train], axis = 1)
X_test_processed = pd.concat([X_test_scaled, cat_df_test], axis = 1)

In [91]:
X_train_scaled.me

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Mar,May,Nov,Oct,Sep,New_Visitor,Other,Returning_Visitor,False,True
0,-0.695075,-0.457171,-0.396532,-0.248563,-0.688681,-0.615738,1.602727,3.230346,-0.321857,-0.309779,...,,,,,,,,,,
1,0.803225,7.543880,-0.396532,-0.248563,0.048852,-0.312898,-0.458434,-0.263259,0.114222,-0.309779,...,,,,,,,,,,
2,-0.695075,-0.457171,-0.396532,-0.248563,-0.666332,-0.578163,-0.458434,0.143270,-0.321857,-0.309779,...,,,,,,,,,,
3,-0.095755,0.096770,-0.396532,-0.248563,-0.264041,-0.409080,-0.458434,-0.817154,-0.321857,-0.309779,...,,,,,,,,,,
4,0.503565,-0.137807,-0.396532,-0.248563,-0.532235,-0.511680,-0.458434,-0.679951,-0.321857,-0.309779,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12319,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
12321,,,,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
12322,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
12324,,,,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [84]:
cat_df_train

Unnamed: 0,OperatingSystems,Browser,Region,TrafficType,Aug,Dec,Feb,Jul,June,Mar,May,Nov,Oct,Sep,New_Visitor,Other,Returning_Visitor,False,True
3339,2,10,2,2,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
10953,3,2,3,2,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
8536,4,1,3,3,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
7766,2,2,1,3,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
8280,3,2,2,13,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,1,1,3,8,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
905,1,2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1
5192,2,4,1,4,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
12172,1,1,3,8,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
