In [None]:
# Modeling exam
# Vadim Zhovtanyuk
# Cisco Systems, 2020

In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import statistics as stat
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
import sys

In [None]:
#Loading the data into pandas dataframe
# The path is relative, so the CSV is looked up in the current working directory.
# header=0 takes the column names from the first line of the file.
# NOTE(review): parse_dates=True without index_col only asks pandas to try parsing the index
# as dates, so it does not convert any data column here -- confirm whether it is needed.
file = 'dnac-customer-bookings.csv'
data = pd.read_csv(file, sep=',', header=0, parse_dates=True)

In [None]:
#First look at the data
data

In [None]:
#The "cust" column is unique in each row, so it is not useful for further analysis; drop it
data = data.drop(columns="cust")

In [None]:
#Checking whether any column contains missing (null/NaN) values
data.isnull().any()

In [None]:
#Checking "segment" column
data["segment"].value_counts()

In [None]:
#Fixing different names for the same segment
def fix(column):
    """Normalise the spelling variants of the segment names in one column.

    "enterprise customer" and "enterprise" become "Enterprise", "commercial"
    becomes "Commercial"; every other value is passed through unchanged.

    Returns a dict with the column name under 'column' and the list of
    normalised values under 'data'.
    """
    def _canonical(label):
        #Both lowercase enterprise spellings collapse onto the same name
        if label in ("enterprise customer", "enterprise"):
            return "Enterprise"
        return "Commercial" if label == "commercial" else label

    return {
        'column': column.name,
        'data': [_canonical(entry) for entry in column],
    }

#Replacing the "segment" column with its normalised values
# fix() is called on "segment" only: pushing it through DataFrame.apply ran the segment
# renaming over every column of the frame (rewriting any other column that happens to hold
# the same lowercase strings) and depended on how pandas collects dict results from apply().
data["segment"] = fix(data["segment"])["data"]

In [None]:
#Verification of segment name
data["segment"].value_counts()

In [None]:
#List of segments
segment_list = data['segment'].unique()
segment_list

In [None]:
#Encoding "segment": values are first mapped to integer codes, then each code gets its own indicator column
data["segment"] = data["segment"].astype("category")
segment_encode = LabelEncoder()
data["segment"] = segment_encode.fit_transform(data["segment"])

segment_one_hot = OneHotEncoder()
segment_one_hot_encode = segment_one_hot.fit_transform(
    data["segment"].to_numpy().reshape(-1, 1)
).toarray()

#Indicator columns are named segment_0, segment_1, ... after the integer codes
ohe_variable = pd.DataFrame(
    segment_one_hot_encode,
    columns=[f"segment_{code}" for code in range(segment_one_hot_encode.shape[1])],
)

#Append the indicator columns and remove the original column
data = pd.concat([data, ohe_variable], axis=1).drop(columns="segment")

In [None]:
data

In [None]:
#Checking "vertical" column
data['vertical'].value_counts()

In [None]:
#List of verticals
vertical_list = data['vertical'].unique()
vertical_list

In [None]:
#Encoding "vertical": values are first mapped to integer codes, then each code gets its own indicator column
data["vertical"] = data["vertical"].astype("category")
vertical_encode = LabelEncoder()
data["vertical"] = vertical_encode.fit_transform(data["vertical"])

vertical_one_hot = OneHotEncoder()
vertical_one_hot_encode = vertical_one_hot.fit_transform(
    data["vertical"].to_numpy().reshape(-1, 1)
).toarray()

#Indicator columns are named vertical_0, vertical_1, ... after the integer codes
ohe_variable = pd.DataFrame(
    vertical_one_hot_encode,
    columns=[f"vertical_{code}" for code in range(vertical_one_hot_encode.shape[1])],
)

#Append the indicator columns and remove the original column
data = pd.concat([data, ohe_variable], axis=1).drop(columns="vertical")

In [None]:
data

In [None]:
#Checking "sub_vertical" column
data["sub_vertical"].value_counts()

In [None]:
#List of sub_verticals
subvertical_list = data["sub_vertical"].unique()
subvertical_list

In [None]:
#Encoding "sub_vertical": values are first mapped to integer codes, then each code gets its own indicator column
data["sub_vertical"] = data["sub_vertical"].astype("category")
sub_vertical_encode = LabelEncoder()
data["sub_vertical"] = sub_vertical_encode.fit_transform(data["sub_vertical"])

sub_vertical_one_hot = OneHotEncoder()
sub_vertical_one_hot_encode = sub_vertical_one_hot.fit_transform(
    data["sub_vertical"].to_numpy().reshape(-1, 1)
).toarray()

#Indicator columns are named sub_vertical_0, sub_vertical_1, ... after the integer codes
ohe_variable = pd.DataFrame(
    sub_vertical_one_hot_encode,
    columns=[f"sub_vertical_{code}" for code in range(sub_vertical_one_hot_encode.shape[1])],
)

#Append the indicator columns and remove the original column
data = pd.concat([data, ohe_variable], axis=1).drop(columns="sub_vertical")

In [None]:
data

In [None]:
#Checking "country" column
data["country"].value_counts()

In [None]:
#Fixing different names for the same country issues
def fix(column):
    """Unify alternative country names in one column.

    "USA" becomes "UNITED STATES" and "DEUTSCHLAND" becomes "GERMANY";
    every other value is passed through unchanged.

    Returns a dict with the column name under 'column' and the list of
    unified values under 'data'.
    """
    def _canonical(name):
        if name == "USA":
            return "UNITED STATES"
        return "GERMANY" if name == "DEUTSCHLAND" else name

    return {
        'column': column.name,
        'data': [_canonical(entry) for entry in column],
    }

#Replacing the "country" column with its unified values
# fix() is called on "country" only: pushing it through DataFrame.apply ran the country
# renaming over every column of the frame and depended on how pandas collects dict
# results from apply().
data["country"] = fix(data["country"])["data"]

In [None]:
#Verification of country names
data["country"].value_counts()

In [None]:
#List of countries
country_list = data["country"].unique()
country_list

In [None]:
#Encoding "country": values are first mapped to integer codes, then each code gets its own indicator column
data["country"] = data["country"].astype("category")
country_encode = LabelEncoder()
data["country"] = country_encode.fit_transform(data["country"])

country_one_hot = OneHotEncoder()
country_one_hot_encode = country_one_hot.fit_transform(
    data["country"].to_numpy().reshape(-1, 1)
).toarray()

#Indicator columns are named country_0, country_1, ... after the integer codes
ohe_variable = pd.DataFrame(
    country_one_hot_encode,
    columns=[f"country_{code}" for code in range(country_one_hot_encode.shape[1])],
)

#Append the indicator columns and remove the original column
data = pd.concat([data, ohe_variable], axis=1).drop(columns="country")

In [None]:
data

In [None]:
#Checking "bookings"
data["bookings"].value_counts()

In [None]:
#Replacing "unknown" and negative values with pre-defined value "1010101"
def fix(column):
    """Mark unusable entries of one column with the placeholder string "1010101".

    An entry is marked when it equals "unknown" or when its string form starts
    with "-"; every other value is passed through unchanged.

    Returns a dict with the column name under 'column' and the list of
    resulting values under 'data'.
    """
    def _is_invalid(entry):
        #The "unknown" check runs first; the sign check works on the string form of the entry
        return entry == "unknown" or str(entry).startswith("-")

    marked = ["1010101" if _is_invalid(entry) else entry for entry in column]

    return {'column': column.name, 'data': marked}

#Replacing the "bookings" column with its marked-up values
# fix() is called on "bookings" only: pushing it through DataFrame.apply ran the check over
# every column, so a negative or "unknown" entry in any other column would also have been
# overwritten with the "1010101" placeholder string.
data["bookings"] = fix(data["bookings"])["data"]

In [None]:
#Verification of bookings values
data["bookings"].value_counts()

In [None]:
#Fixing wrong data in bookings 
def missing(column, sentinel=1010101):
    """Replace sentinel-marked entries of a column with the median of the other entries.

    Parameters
    ----------
    column : iterable with a ``name`` attribute (e.g. a pandas Series)
        Values to clean; entries equal to ``sentinel`` are treated as invalid.
    sentinel : optional
        Marker value that stands in for invalid entries. Defaults to 1010101,
        the value that used to be hard-coded in this function.

    Returns
    -------
    dict
        'column': the column name, 'removed_median': the median of the valid
        entries, 'data': list of the values with every sentinel entry replaced
        by that median.

    Raises
    ------
    statistics.StatisticsError
        If the column contains no valid (non-sentinel) entry.
    """
    results = {}
    results['column'] = column.name

    #Step1 - Removing the sentinel entries and calculating the median of the rest
    valid_values = [x for x in column if x != sentinel]
    if not valid_values:
        #Same exception type statistics.median raises on empty data, with a message naming the column
        raise stat.StatisticsError(
            "column %r has no valid entries to compute a median from" % (column.name,)
        )
    results['removed_median'] = stat.median(valid_values)

    #Step2 - Replacing every sentinel entry with the median computed above
    results['data'] = [
        value if value != sentinel else results['removed_median']
        for value in column
    ]

    return results

#Replace the placeholder entries in "bookings" with the median of the valid bookings
# The column is cast to int64 first so that the placeholder compares equal to the integer
# 1010101 that missing() looks for.
ds = data.astype({'bookings': 'int64'})

# Only "bookings" is cleaned: the previous version pushed every float64/int64 column
# (one-hot indicator columns included) through missing() and rewrote each of them.
ds['bookings'] = missing(ds['bookings'])['data']

# Cast back to int64; statistics.median can return a half-integer when the number of
# valid entries is even, in which case this cast truncates the imputed value.
data = ds.astype({'bookings': 'int64'})

In [None]:
data

In [None]:
#Purchase list
purchase_list = data["purchase"].unique()
purchase_list

In [None]:
#Encoding "purchase": values are first mapped to integer codes, then each code gets its own indicator column
data["purchase"] = data["purchase"].astype("category")
purchase_encode = LabelEncoder()
data["purchase"] = purchase_encode.fit_transform(data["purchase"])

purchase_one_hot = OneHotEncoder()
purchase_one_hot_encode = purchase_one_hot.fit_transform(
    data["purchase"].to_numpy().reshape(-1, 1)
).toarray()

#Indicator columns are named purchase_0, purchase_1, ... after the integer codes
ohe_variable = pd.DataFrame(
    purchase_one_hot_encode,
    columns=[f"purchase_{code}" for code in range(purchase_one_hot_encode.shape[1])],
)

#Append the indicator columns and remove the original column
data = pd.concat([data, ohe_variable], axis=1).drop(columns="purchase")

In [None]:
data

In [None]:
data.describe()

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
#Number of rows per "purchase" class
# The "purchase" column itself was dropped when it was one-hot encoded, so grouping by it
# raises KeyError. The class sizes are recovered by summing each purchase_<code> indicator
# column and labelling the result with the class names stored in the fitted LabelEncoder.
purchase_indicator_columns = ["purchase_" + str(code) for code in range(len(purchase_encode.classes_))]
purchase_counts = data[purchase_indicator_columns].sum().astype("int64")
purchase_counts.index = pd.Index(purchase_encode.classes_, name="purchase")
purchase_counts

In [None]:
data.corr(method='pearson')

In [None]:
data.skew()

In [None]:
#Histogram of the "bookings" column, with a title and axis labels so the figure stands on its own
ax = data["bookings"].hist()
ax.set_title("Distribution of bookings")
ax.set_xlabel("bookings")
ax.set_ylabel("count")
pyplot.show()

In [None]:
#Heat map of the pairwise correlations between the columns of the frame
correlations = data.corr()
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# One tick per column of the correlation matrix. The column count depends on how many
# categories were one-hot encoded, so it is derived from the matrix instead of being
# fixed at nine, and each tick is labelled with its column name.
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns, rotation=90)
ax.set_yticklabels(correlations.columns)
pyplot.show()

In [None]:
data

In [None]:
# [END]