In [3]:
#Import libraries
import collections
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

#Classifier libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
#Need to pip install xgboost
# import xgboost as xgb

In [10]:
#Import and manipulate dataset here
#Dataset credit goes to https://github.com/lostincalibasas 

#Location of the customer journey CSV. Set the CUSTOMER_JOURNEY_CSV environment variable
#to point at your own copy; the fallback is the original author's local path.
JOURNEY_CSV_PATH = os.environ.get(
    'CUSTOMER_JOURNEY_CSV',
    '/Users/chelseavalencia/Desktop/customer_journey.csv',
)

journey = pd.read_csv(JOURNEY_CSV_PATH)

In [18]:
#Show first five rows of dataset
#Only the last expression of a cell is rendered automatically, so earlier results are displayed explicitly
display(journey.head())

#Find number of unique IDs
n_unique_users = journey.user_id.nunique()
print("Unique user IDs:", n_unique_users)

#Average session per user (rows with a non-null user_id divided by distinct users)
journey.user_id.count() / n_unique_users

7.359259259259259

In [5]:
#Create dataframe for two populations; this may not be needed with the new dataset
#NOTE(review): `summary_df` is not created in any visible cell (this cell raised NameError when last run).
#It presumably has to be derived from `journey` -- TODO confirm its source and its 'id' / 'is_conversion' columns.
conversion = summary_df[summary_df['is_conversion'] == 1]
non_conversion = summary_df[summary_df['is_conversion'] == 0]

#Get count of conversions
display(conversion.count())

#Find unique IDs in non_conversion dataframe. 
unique_ids = non_conversion['id'].unique()

#Set seed to reproduce results
np.random.seed(42)

#Take a sample of IDs from the unique_id
#Cap the sample at the number of available IDs: sampling without replacement
#raises ValueError when more items are requested than exist
sample_size = min(1000, len(unique_ids))
sampled_ids = np.random.choice(unique_ids, size = sample_size, replace = False)

#Get sampled dataset
#Filter the non-conversion rows only, so conversion rows belonging to a sampled ID are not pulled in twice
sample_non_conversion = non_conversion[non_conversion['id'].isin(sampled_ids)]

#Create final dataframe
#Use the sampled non-conversions (not the full non_conversion frame) so the down-sampling actually takes effect
prospects = pd.concat([conversion, sample_non_conversion], ignore_index = True)

NameError: name 'summary_df' is not defined

In [None]:
#Verify equal classes
#One colour per class (0 = non-conversion, 1 = conversion); an empty palette gives seaborn no colours to draw with
colors = ['#0101DF', '#DF0101']

fig, ax = plt.subplots()
sns.countplot(x = 'is_conversion', data = prospects, palette = colors, ax = ax)

#Plain (non-scientific) tick labels on the numeric count axis
ax.ticklabel_format(style = 'plain', axis = 'y')

#matplotlib's keyword is `fontsize`; `font_size` is rejected as an unknown text property
ax.set_title('Class Distributions', fontsize = 14)

#plt.show must be called, not just referenced
plt.show()

In [None]:
#Define independent and dependent variables
#Target: the binary conversion flag the earlier cells use to separate the two populations
TARGET_COLUMN = 'is_conversion'
y = prospects[TARGET_COLUMN]

#Features: the original 'column_names' was a placeholder, and selecting a single label returns a Series,
#which has neither .columns nor a correlation matrix.
#NOTE(review): as a working default this keeps every numeric column except the target and the 'id'
#identifier -- TODO replace with the intended feature list.
X = (
    prospects
    .drop(columns = [TARGET_COLUMN, 'id'], errors = 'ignore')
    .select_dtypes(include = 'number')
)

In [None]:
#Check for correlation between the variables
#figsize must be passed as a keyword argument; `figsize(15,5)` tried to call an undefined function
f, ax1 = plt.subplots(1, 1, figsize = (15, 5))

#Pairwise correlation between the feature columns
prospects_corr = X.corr()

#annot_kws only has an effect if annotations are switched on with annot=True
sns.heatmap(prospects_corr, cmap = 'coolwarm_r', annot_kws={'size':20}, ax=ax1)

#matplotlib's keyword is `fontsize`, not `font_size`
ax1.set_title("Correlation Matrix", fontsize = 14)

plt.show()

In [None]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
#Instantiate Logistic Regression model
model_logreg = LogisticRegression()

#Fit model with training data
model_logreg.fit(X_train, y_train)

#Predict class labels for the test set (.predict returns labels, not probabilities)
y_pred_logreg = model_logreg.predict(X_test)

#Predicted probability of the positive class, for threshold-based metrics such as ROC
#assumes a binary target whose positive class is the second entry of model_logreg.classes_ -- TODO confirm
y_score_logreg = model_logreg.predict_proba(X_test)[:, 1]

#Backward-compatible alias: despite its name this holds class labels,
#which is what the classification_report cell below passes in
y_pred_prob = y_pred_logreg

In [None]:
#10-fold cross validation of the logistic regression on the training split
cv_scores = cross_val_score(model_logreg, X_train, y_train, cv = 10)

#Report the per-fold scores followed by their mean
cv_summary = (
    ("Cross-validation scores:", cv_scores),
    ("Mean cross-validation score:", cv_scores.mean()),
)
for caption, value in cv_summary:
    print(caption, value)

In [None]:
#Per-class precision / recall / F1 of the logistic regression on the test set
logreg_report = classification_report(y_test, y_pred_prob)
print(logreg_report)

In [None]:
#Coefficients and Odds Ratio
#coef_[0] is the coefficient row of the fitted model; exponentiating a log-odds coefficient gives an odds ratio
coefficients = model_logreg.coef_[0]
odds_ratio = np.exp(coefficients)

#Display feature importance using coefficients and odds ratio
#(the original referenced an undefined name `odds_ratios` here)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Odds Ratio': odds_ratio
})

print("\nFeature Importance (Coefficient and Odds Ratio):")
print(feature_importance.sort_values(by='Coefficient', ascending = False))

In [None]:
#Calculate ROC curve
#roc_curve sweeps a decision threshold, so it needs continuous scores: hard 0/1 labels
#collapse the curve to a single operating point. Use the positive-class probability instead.
#assumes a binary target whose positive class is the second entry of model_logreg.classes_ -- TODO confirm
y_score_logreg = model_logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_logreg)
roc_auc = auc(fpr, tpr)

#Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
#Diagonal = performance of a classifier with no skill
plt.plot([0,1], [0,1], 'k--', label = 'No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MQL Classification')
plt.legend()
plt.show()

In [None]:
#Decision tree comparison

#Creating classifier object
model_decision_tree = DecisionTreeClassifier(criterion="gini", random_state = 100, max_depth=3, min_samples_leaf=5)

#Train model
#(the original called an undefined `decision_tree_model`; use the object created above)
model_decision_tree.fit(X_train, y_train)

#Predict response
y_pred = model_decision_tree.predict(X_test)

#Placeholder function for cal_accuracy
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

#Cross validation
cv_scores_dt = cross_val_score(model_decision_tree, X_train, y_train, cv=10)

print("Cross-validation scores:", cv_scores_dt)
print("Mean cross-validation score:", cv_scores_dt.mean())

In [None]:
#Calculate ROC curve
#Score the decision tree itself: the original reused y_pred_prob, which holds the
#logistic regression's predictions, so this plot did not describe the tree at all.
#Requires model_decision_tree to have been fitted.
#assumes a binary target whose positive class is the second probability column -- TODO confirm
y_score_dt = model_decision_tree.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_dt)
roc_auc = auc(fpr, tpr)

#Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
#Diagonal = performance of a classifier with no skill
plt.plot([0,1], [0,1], 'k--', label = 'No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MQL Classification')
plt.legend()
plt.show()

In [None]:
#Random Forest Classifier

#Instantiate Model
#Fix random_state so the bootstrap samples and feature draws (and therefore every metric below) are reproducible
model_random_forest = RandomForestClassifier(n_estimators = 100, random_state = 42)

#Performing training
model_random_forest.fit(X_train, y_train)

#Predict response
y_pred_rf = model_random_forest.predict(X_test)

#Calculate accuracy
print(confusion_matrix(y_test, y_pred_rf))
print(accuracy_score(y_test, y_pred_rf)*100)
print(classification_report(y_test, y_pred_rf))

In [None]:
#Rank the features by the fitted forest's feature_importances_, largest first
importances = model_random_forest.feature_importances_
feature_names = X.columns

feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances})
feature_imp_df = feature_imp_df.sort_values('Gini Importance', ascending=False)

print(feature_imp_df)

In [None]:
#Calculate ROC curve
#Score the random forest itself: the original reused y_pred_prob, which holds the
#logistic regression's predictions, so this plot did not describe the forest at all.
#assumes a binary target whose positive class is the second probability column -- TODO confirm
y_score_rf = model_random_forest.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)

#Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
#Diagonal = performance of a classifier with no skill
plt.plot([0,1], [0,1], 'k--', label = 'No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MQL Classification')
plt.legend()
plt.show()

In [None]:
#XGBoost Classifier
#NOTE: needs the xgboost package; the `import xgboost as xgb` line in the imports cell is currently commented out
#Use the binary logistic objective instead of a 3-class softmax.
#assumes y is the binary is_conversion flag (0/1) -- TODO confirm
model_xg = xgb.XGBClassifier(objective='binary:logistic')

#Perform training
model_xg.fit(X_train, y_train)

#Predict response
#(the original predicted with an undefined `random_forest_model`, i.e. not with the XGBoost model)
y_pred_xg = model_xg.predict(X_test)

#Calculate accuracy
print(confusion_matrix(y_test, y_pred_xg))
print(accuracy_score(y_test, y_pred_xg)*100)
print(classification_report(y_test, y_pred_xg))

In [None]:
#Calculate ROC curve
#Score the XGBoost model itself: the original reused y_pred_prob, which holds the
#logistic regression's predictions, so this plot did not describe XGBoost at all.
#Requires model_xg to be fitted with an objective that supports predict_proba.
#assumes a binary target whose positive class is the second probability column -- TODO confirm
y_score_xg = model_xg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_xg)
roc_auc = auc(fpr, tpr)

#Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
#Diagonal = performance of a classifier with no skill
plt.plot([0,1], [0,1], 'k--', label = 'No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MQL Classification')
plt.legend()
plt.show()

In [None]:
#Isolation Forest for anomaly detection
#contamination = expected share of outliers; random_state makes the fitted forest reproducible
clf = IsolationForest(contamination=0.1, random_state=42)

#Unsupervised: fitted on the features only, the labels are not used
clf.fit(X_train)

#IsolationForest.predict returns +1 for inliers and -1 for outliers, which cannot be compared
#with 0/1 labels directly. Recode outliers as 1 and inliers as 0.
#NOTE(review): assumes the anomalies of interest are the conversions (class 1) -- TODO confirm
y_pred_if = np.where(clf.predict(X_test) == -1, 1, 0)

print(confusion_matrix(y_test, y_pred_if))
print(accuracy_score(y_test, y_pred_if)*100)
print(classification_report(y_test, y_pred_if))