In [None]:
# Modeling exam
# Vadim Zhovtanyuk
# Cisco Systems, 2020
# Code file, rel. 3.0

In [None]:
import pandas as pd
import statistics as stat
from pandas import read_csv
from matplotlib import pyplot

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:
#Reading raw data
# The path is relative, so the CSV must sit in the notebook's working directory.
file = 'training.csv'
# Comma-separated file; header=0 takes the column names from the first row.
# NOTE(review): parse_dates=True is passed without index_col — confirm it has any effect here.
data = pd.read_csv(file, sep=',', header=0, parse_dates=True)
# Show the loaded frame as the cell output.
data

In [None]:
data["cust"].describe()

In [None]:
data["segment"].describe()

In [None]:
data["vertical"].describe()

In [None]:
data["sub_vertical"].describe()

In [None]:
data["country"].describe()

In [None]:
data["bookings"].describe()

In [None]:
#Checking "segment" column
data["segment"].value_counts()

In [None]:
#Fixing different names for the same segment
def fix_segment(column):
    """Normalise the spelling of customer-segment labels in one column.

    Every value is converted to a lower-case string (non-string values,
    including NaN, are stringified first), and the alias
    "enterprise customer" is collapsed into "enterprise". The alias is
    matched on the lower-cased text, so spellings such as
    "Enterprise Customer" are merged too; previously only the exact
    lower-case spelling was recognised.

    Parameters
    ----------
    column : iterable of values with a ``name`` attribute
        For example a pandas Series handed over by ``DataFrame.apply``.

    Returns
    -------
    dict
        ``{'column': <column.name>, 'data': <list of normalised values>}``
    """
    # Alternative spellings (already lower-cased) -> canonical segment name
    aliases = {"enterprise customer": "enterprise"}

    segment_column = []
    for value in column:
        lowered = str(value).lower()
        segment_column.append(aliases.get(lowered, lowered))

    return {'column': column.name, 'data': segment_column}

#Replacing data with new value
# DataFrame.apply hands each column of `data` to fix_segment; the loop below treats
# every element of the result as the dict that fix_segment returns and writes its
# 'data' list back under the 'column' name.
# NOTE(review): since every column goes through fix_segment, all columns (not only
# "segment") are rewritten as lower-case strings — confirm this is intended.
temp_data = data.apply(fix_segment)
for m in temp_data:
    data[m['column']] = m['data']

In [None]:
data["segment"].value_counts()

In [None]:
#Removing "cust" column as unique, removing "bookings" column which has too many unique values
#Encoding "segment", "vertical", "sub_vertical" and "purchase" columns
data = data.drop(columns=["cust", "bookings"])

encode = LabelEncoder()

# One shared encoder is refitted for each column in turn, so once this cell has
# run it is fitted on the last column handled here ("purchase").
for column_name in ("segment", "vertical", "sub_vertical", "purchase"):
    data[column_name] = encode.fit_transform(data[column_name].astype("category"))


data

In [None]:
#Checking "country" column
data["country"].value_counts()

In [None]:
#Fixing different names for the same country
def fix(column):
    """Map alternative country spellings onto one canonical name.

    "usa" becomes "united states" and "deutschland" becomes "germany"
    (exact matches only); every other value is passed through unchanged.

    Returns a dict holding the column's name under 'column' and the list
    of resulting values under 'data'.
    """
    def canonical(value):
        # Exact comparison, in the same order as the aliases are listed above
        if value == "usa":
            return "united states"
        if value == "deutschland":
            return "germany"
        return value

    return {'column': column.name, 'data': [canonical(value) for value in column]}

#Replacing data with new value
# DataFrame.apply hands each column of `data` to fix; the loop below treats every
# element of the result as the dict that fix returns and writes its 'data' list
# back under the 'column' name.
# NOTE(review): fix is run on every column, not only "country" — confirm no other
# column is expected to contain the values "usa" or "deutschland".
temp_data = data.apply(fix)
for m in temp_data:
    data[m['column']] = m['data']

In [None]:
#Encoding "country" column
# Reuses the shared `encode` object, which is refitted on this column.
data["country"] = encode.fit_transform(data["country"].astype("category"))

data

In [None]:
data["country"].value_counts()

In [None]:
#Scaling the "vertical", "sub_vertical" and "country" columns
# Each encoded column is replaced by a min-max scaled copy under a new name;
# "segment" and "purchase" are not touched by this cell.
scaler = MinMaxScaler()

for source_name, scaled_name in (
    ("vertical", "vert_scaled"),
    ("sub_vertical", "sub_scaled"),
    ("country", "country_scaled"),
):
    # The scaler expects a 2-D input, hence the reshape to a single column
    data[scaled_name] = scaler.fit_transform(data[source_name].values.reshape(-1, 1))
    data = data.drop(source_name, axis=1)

data

In [None]:
data['country_scaled'].describe()

In [None]:
len(list(data["country_scaled"].unique()))

In [None]:
data['vert_scaled'].describe()

In [None]:
len(list(data["vert_scaled"].unique()))

In [None]:
data['sub_scaled'].describe()

In [None]:
len(list(data["sub_scaled"].unique()))

In [None]:
data.describe()

In [None]:
#Visualization
# Histograms of every feature column; the "purchase" target is left out.
plotting_data = data.drop(columns="purchase")
plotting_data.hist(figsize=(12, 12))
pyplot.show()

In [None]:
plotting_data.plot.kde(figsize=(20,10),subplots=True)
pyplot.show()

In [None]:
plotting_data.plot.box(vert=False, figsize=(12,8))
pyplot.show()

In [None]:
# RadViz plot of the whole frame, with 'purchase' passed as the class column.
from pandas.plotting import radviz
# Open a large figure first so the plot is drawn at 30x16 inches.
pyplot.figure(figsize=(30,16))
radviz(data, 'purchase')

In [None]:
#Create feature and target for classification
# Target: the encoded "purchase" column. Features: every remaining column, as an array.
target_1 = data["purchase"].values
features_1 = data.drop(columns="purchase").values

In [None]:
#Split the data for training and testing sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(features_1, target_1, test_size = 0.3, random_state = 42, stratify = target_1)

In [None]:
data

In [None]:
#Feature selection
data.head(3)

In [None]:
#X_train has: segment, vert_scaled, sub_scaled, country_scaled
X_train_1[0:3,:]

In [None]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
# Univariate selection: score each feature with chi2 and keep the 3 best (k=3).
test = SelectKBest(score_func=chi2, k=3) 
# Fitted on the training split of feature set 1 only.
# NOTE: the name `fit` is rebound by the later RFE and PCA cells, so cells that
# read `fit` depend on which of these cells ran last.
fit = test.fit(X_train_1, y_train_1)

In [None]:
fit.scores_

In [None]:
features = fit.transform(X_train_1)

In [None]:
#Univariate selected 3 best features - vert_scaled, sub_scaled, country_scaled
features[0:3,:]

In [None]:
#Trying to use RFE
from sklearn.feature_selection import RFE 
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination around a logistic-regression estimator,
# keeping 3 features, fitted on the training split of feature set 1.
model = LogisticRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases accept
# only the estimator positionally, so RFE(model, 3) raises a TypeError there.
rfe = RFE(model, n_features_to_select=3)
# NOTE: `fit` is rebound here; it previously held the SelectKBest result.
fit = rfe.fit(X_train_1, y_train_1)

In [None]:
fit.n_features_

In [None]:
fit.support_

In [None]:
#RFE selected the same 3 best features - vert_scaled, sub_scaled, country_scaled
fit.ranking_

In [None]:
#trying to use PCA
from sklearn.decomposition import PCA
# Principal component analysis with 3 components, fitted on the training split of feature set 1.
pca = PCA(n_components=3) 
# NOTE: `fit` is rebound here; it previously held the RFE result, so cells that
# read `fit` depend on which of these cells ran last.
fit = pca.fit(X_train_1) 

In [None]:
fit.explained_variance_ratio_ 

In [None]:
fit.components_

In [None]:
#Finding feature importance
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Fit an extra-trees ensemble on the training split of feature set 1 so that its
# feature_importances_ can be inspected.
# A fixed random_state makes the randomised trees, and therefore the reported
# importances, reproducible across kernel restarts.
model = ExtraTreesClassifier(random_state=42)
model.fit(X_train_1, y_train_1)

In [None]:
model.feature_importances_

In [None]:
#All feature selection approaches returned the same features to be used for future analysis - vert_scaled, sub_scaled, country_scaled

In [None]:
#Create new feature and target for classification after feature selection done by removing segment. Target is still the same.
# Feature set 2 = all columns except "segment" (and the "purchase" target).
data_2 = data.drop(columns="segment")
features_2 = data_2.drop(columns="purchase").values
target_2 = data["purchase"].values

In [None]:
#Split the new data for training and testing sets
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(features_2, target_2, test_size = 0.3, random_state = 42, stratify = target_2)

In [None]:
plotting_data_2 = data_2.drop("purchase", axis=1)
plotting_data_2.plot.kde(figsize=(20,10),subplots=True)
pyplot.show()

In [None]:
#Create new feature and target for classification after feature selection done by removing country for the testing. Target is still the same.
# Feature set 3 = feature set 2 without "country_scaled".
target_3 = data["purchase"].values
data_3 = data_2.drop("country_scaled", axis=1)
# Build the matrix from data_3 (not data_2) so that "country_scaled" is really
# excluded; taking it from data_2 made feature set 3 identical to feature set 2.
features_3 = data_3.drop("purchase", axis=1).values

In [None]:
#Split the new data for training and testing sets
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(features_3, target_3, test_size = 0.3, random_state = 42, stratify = target_3)

In [None]:
plotting_data_3 = data_3.drop("purchase", axis=1)
plotting_data_3.plot.kde(figsize=(12,8),subplots=True)
pyplot.show()

In [None]:
plotting_data_3.plot.box(vert=False, figsize=(12,8))
pyplot.show()

In [None]:
#Doing model comparison

In [None]:
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC

In [None]:
#prepare models
# (label, estimator) pairs consumed by the cross-validation cells.
# The decision tree gets a fixed random_state so that its cross-validation scores
# are reproducible between runs; the other estimators are created with no arguments.
models = [
    ("LR", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("KNN", KNeighborsClassifier()),
    ("CART", DecisionTreeClassifier(random_state=7)),
    ("NB", GaussianNB()),
    ("SVM", SVC()),
]

In [None]:
#Evaluate each model in turn for feature set 1
# 10-fold shuffled cross-validation on the training split, scored by accuracy.
# Per-fold scores and model labels are collected in results_1 / names_1.
scoring = "accuracy"
results_1, names_1 = [], []
for name, model in models:
    # Same seed for every model, so the splitter is configured identically each time
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_1, y_train_1, scoring=scoring, cv=kfold)
    results_1.append(cv_results)
    names_1.append(name)
    # label: mean (standard deviation) over the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
#Boxplot algorithm comparison for feature set 1
# One box per model, built from the per-fold scores in results_1.
fig, ax = pyplot.subplots(figsize=(12, 8))
fig.suptitle("Algorithm Comparison")
ax.boxplot(results_1)
ax.set_xticklabels(names_1)
pyplot.show()

In [None]:
#Evaluate each model in turn for feature set 2
# 10-fold shuffled cross-validation on the training split, scored by accuracy.
# Per-fold scores and model labels are collected in results_2 / names_2.
scoring = "accuracy"
results_2, names_2 = [], []
for name, model in models:
    # Same seed for every model, so the splitter is configured identically each time
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_2, y_train_2, scoring=scoring, cv=kfold)
    results_2.append(cv_results)
    names_2.append(name)
    # label: mean (standard deviation) over the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
#Boxplot algorithm comparison for feature set 2
# One box per model, built from the per-fold scores in results_2.
fig, ax = pyplot.subplots(figsize=(12, 8))
fig.suptitle("Algorithm Comparison")
ax.boxplot(results_2)
ax.set_xticklabels(names_2)
pyplot.show()

In [None]:
#Evaluate each model in turn for feature set 3
# 10-fold shuffled cross-validation on the training split, scored by accuracy.
# Per-fold scores and model labels are collected in results_3 / names_3.
scoring = "accuracy"
results_3, names_3 = [], []
for name, model in models:
    # Same seed for every model, so the splitter is configured identically each time
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_3, y_train_3, scoring=scoring, cv=kfold)
    results_3.append(cv_results)
    names_3.append(name)
    # label: mean (standard deviation) over the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
#Boxplot algorithm comparison for feature set 3
# One box per model, built from the per-fold scores in results_3.
fig, ax = pyplot.subplots(figsize=(12, 8))
fig.suptitle("Algorithm Comparison")
ax.boxplot(results_3)
ax.set_xticklabels(names_3)
pyplot.show()

In [None]:
#Evaluating 6 different algorithms shows LR, LDA, NB and SVM give the same score for all feature sets
#Selecting LDA to use pipeline for further testing

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
#Create pipeline
# Two steps: standardise the features, then fit linear discriminant analysis.
estimators = [
    ("standardize", StandardScaler()),
    ("lda", LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

In [None]:
#Evaluate pipeline for feature set 1
# Same splitter settings as the model-comparison cells (10 shuffled folds, seed 7).
kfold = KFold(n_splits=10, random_state=7, shuffle=True) 
# No `scoring` argument is passed here, unlike the comparison cells, which request "accuracy".
results = cross_val_score(model, X_train_1, y_train_1, cv=kfold) 
# Mean of the per-fold scores.
print(results.mean())