In [31]:
import os
# Recursively list every file under the data directory, one full path per line.
data_root = '/content/abtest-mlops/data'
for dirname, _, filenames in os.walk(data_root):
    for filename in filenames:
        files_dic = os.path.join(dirname, filename)
        print(files_dic)

/content/abtest-mlops/data/split_AdSmartABdata.csv
/content/abtest-mlops/data/.gitignore
/content/abtest-mlops/data/samsug_internetAdSmartABdata.csv
/content/abtest-mlops/data/AdSmartABdata.csv.dvc
/content/abtest-mlops/data/Chrome_MobileAdSmartABdata.csv
/content/abtest-mlops/data/Chrome_WebViewAdSmartABdata.csv
/content/abtest-mlops/data/FacebookAdSmartABdata.csv


In [32]:
# Import pandas and NumPy for data manipulation
import pandas as pd
import numpy as np

# To preprocess our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate the final results
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score


# Load the Chrome Mobile AdSmart A/B-test CSV into a pandas DataFrame.
chrome_mobile_csv = "/content/abtest-mlops/data/Chrome_MobileAdSmartABdata.csv"
df = pd.read_csv(chrome_mobile_csv)

In [33]:
# Remove the 'auction_id' column and the unnamed first CSV column
# ('Unnamed: 0'); neither is used as a feature below.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and the call is a no-op instead of raising KeyError.
df = df.drop(columns=['auction_id', 'Unnamed: 0'], errors="ignore")


In [34]:
# Preview the first five rows after dropping the identifier columns.
df.head(n=5)

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no
0,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
1,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
2,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1
3,exposed,2020-07-10,2,Generic Smartphone,6,Chrome Mobile,0,1
4,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,1,0


In [35]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_column = list(df.select_dtypes(include="object").columns)
numerical_column = list(df.select_dtypes(exclude="object").columns)

print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

Numerical Columns: ['hour', 'platform_os', 'yes', 'no']
****************
Categorical Columns: ['experiment', 'date', 'device_make', 'browser']


In [36]:
# Categorical columns with more than 2 and at most 10 distinct values
# are one-hot encoded.
to_one_hot_encoding = [
    col for col in categorical_column
    if 2 < df[col].nunique() <= 10
]

# Every remaining categorical column is label encoded instead.
to_label_encoding = [
    col for col in categorical_column
    if col not in to_one_hot_encoding
]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)

To One Hot Encoding: ['date']
To Label Encoding: ['experiment', 'device_make', 'browser']


In [37]:

# Label-encode each column in `to_label_encoding` into integer codes.
# The per-column frames are collected in their own list so that
# `label_encoded_columns` only ever refers to the final DataFrame.
encoded_frames = []
for col in to_label_encoding:
    # A fresh encoder per column, so each column gets its own class mapping.
    le = LabelEncoder()
    # fit_transform returns a bare array; reuse df's index so the result lines
    # up row-for-row with df when the feature frames are concatenated along
    # axis=1, even if df's index is not the default 0..n-1 range.
    column_dataframe = pd.DataFrame(
        le.fit_transform(df[col]), columns=[col], index=df.index
    )
    encoded_frames.append(column_dataframe)

# Combine the per-column frames side by side into one DataFrame.
label_encoded_columns = pd.concat(encoded_frames, axis=1)
label_encoded_columns

Unnamed: 0,experiment,device_make,browser
0,1,1,0
1,1,1,0
2,0,1,0
3,1,1,0
4,0,1,0
...,...,...,...
690,1,1,0
691,1,1,0
692,0,1,0
693,0,1,0


In [38]:
# One-hot encode the `to_one_hot_encoding` columns with pandas' get_dummies().
# dtype=int pins the indicator columns to 0/1 integers: pandas 2.0+ returns
# booleans by default, which would no longer match the 0/1 matrix recorded in
# this notebook's output or the integer feature columns it is merged with.
one_hot_encoded_columns = pd.get_dummies(df[to_one_hot_encoding], dtype=int)
one_hot_encoded_columns

Unnamed: 0,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10
0,0,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
690,0,0,1,0,0,0,0,0
691,0,1,0,0,0,0,0,0
692,0,0,0,0,0,0,1,0
693,0,0,0,0,0,0,0,1


In [39]:
# Assemble the feature matrix X: the non-categorical columns of df, followed
# by the one-hot encoded and the label-encoded categorical columns.
# drop(columns=...) returns a new frame, so df itself is left untouched.
numeric_part = df.drop(columns=categorical_column)
X = pd.concat(
    [numeric_part, one_hot_encoded_columns, label_encoded_columns],
    axis=1,
)

print("All columns:", X.columns.tolist())
X

All columns: ['hour', 'platform_os', 'yes', 'no', 'date_2020-07-03', 'date_2020-07-04', 'date_2020-07-05', 'date_2020-07-06', 'date_2020-07-07', 'date_2020-07-08', 'date_2020-07-09', 'date_2020-07-10', 'experiment', 'device_make', 'browser']


Unnamed: 0,hour,platform_os,yes,no,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10,experiment,device_make,browser
0,16,6,1,0,0,1,0,0,0,0,0,0,1,1,0
1,8,6,0,1,0,0,0,1,0,0,0,0,1,1,0
2,15,6,0,1,1,0,0,0,0,0,0,0,0,1,0
3,2,6,0,1,0,0,0,0,0,0,0,1,1,1,0
4,15,6,1,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,21,6,1,0,0,0,1,0,0,0,0,0,1,1,0
691,1,6,1,0,0,1,0,0,0,0,0,0,1,1,0
692,7,6,0,1,0,0,0,0,0,0,1,0,0,1,0
693,16,6,0,1,0,0,0,0,0,0,0,1,0,1,0


In [None]:
# The prediction target is the "yes" column of df.
y = df["yes"]

# Remove the target from the features, together with the "no" column.
# "no" is the counterpart answer to "yes" (in the rows previewed earlier it is
# its exact complement), so leaving it in X would leak the label to every model.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
X = X.drop(columns=["yes", "no"], errors="ignore")
X


In [None]:
# First split: 70% of the rows go to training, the remaining 30% are held out.
# random_state pins the shuffle so the split, and every score derived from it,
# is identical on each run.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# Second split: test_size=0.333 sends about one third of the held-out rows
# (roughly 10% of all data) to the test set; the other two thirds
# (roughly 20% of all data) become the validation set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.333, random_state=42
)

In [None]:
### Random Forest

# Random forest with 100 trees. random_state fixes the bootstrap sampling and
# feature sub-sampling so the fitted model and its predictions are repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split.
rf.fit(X_train, y_train)

# Predict labels for the test split.
pred_rf = rf.predict(X_test)

# Compare the first 10 predictions with their actual values.
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])


In [None]:

### Decision Tree

# Decision tree classifier. random_state makes the tree repeatable: without it,
# ties between equally good splits are broken randomly from run to run.
dt = DecisionTreeClassifier(random_state=42)

# Fit on the training split.
dt.fit(X_train, y_train)

# Predict labels for the test split.
pred_dt = dt.predict(X_test)

# Compare the first 10 predictions with their actual values.
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])


In [None]:

### Logistic Regression

# Create the model and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
log = LogisticRegression().fit(X_train, y_train)

# Predict labels for the test split.
pred_log = log.predict(X_test)

# Show the first 10 predictions next to their actual values.
print("Predicted:", pred_log[:10])
print("Actual:", y_test[:10])


In [None]:

### Bernoulli Naive Bayes

# Create the Bernoulli Naive Bayes model and fit it on the training split
# in one step (fit() returns the fitted estimator itself).
bnb = BernoulliNB().fit(X_train, y_train)

# Predict labels for the test split.
pred_bnb = bnb.predict(X_test)

# Show the first 10 predictions next to their actual values.
print("Predicted:", pred_bnb[:10])
print("Actual:", y_test[:10])