In [61]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, accuracy_score, f1_score , confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns

In [62]:
df = pd.read_excel('./dataset.xls')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.isna().sum()

Customer                0
Agent                   0
SalesAgentEmailID       0
ContactEmailID          0
Stage                   0
Product                 0
Close_Value          1208
Created Date            0
Close Date              0
dtype: int64

In [63]:
products = df.Product.unique()
for product in products:
    mean = df[df.Product == product].Close_Value.mean()
    df.loc[df[(df.Product == product) & (df.Close_Value.isna())].index, "Close_Value"] = mean
    
df.isna().sum()

Customer             0
Agent                0
SalesAgentEmailID    0
ContactEmailID       0
Stage                0
Product              0
Close_Value          0
Created Date         0
Close Date           0
dtype: int64

In [64]:
df.Stage.value_counts()

Won            3738
In Progress    2089
Lost           1973
Name: Stage, dtype: int64

In [65]:
new_df = df.copy()

one_hot_products = pd.get_dummies(new_df.Product)

new_df["Created Date"] = new_df["Created Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
new_df["Close Date"] = new_df["Close Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
new_df = new_df.join(one_hot_products)
new_df.drop("Product", axis=1, inplace=True)
new_df.drop(["Customer", "SalesAgentEmailID", "ContactEmailID", "Agent"], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(new_df.drop("Stage", axis=1), new_df.Stage)
model = RandomForestClassifier()
model.fit(new_df[new_df['Stage']!='In Progress'].drop("Stage", axis=1),new_df[new_df['Stage']!='In Progress'].Stage)
model.predict(new_df[new_df.Stage == "In Progress"].drop("Stage", axis=1))
df.loc[df[df.Stage == "In Progress"].index, "Stage"] = model.predict(new_df[new_df.Stage == "In Progress"].drop("Stage", axis=1))
df.Stage.value_counts()

  new_df["Created Date"] = new_df["Created Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
  new_df["Close Date"] = new_df["Close Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))


Won     4213
Lost    3587
Name: Stage, dtype: int64

In [66]:
df

Unnamed: 0,Customer,Agent,SalesAgentEmailID,ContactEmailID,Stage,Product,Close_Value,Created Date,Close Date
0,Konex,Corliss Cosme,corliss@piedpiper.com,delila@konex.com,Won,GTX Plus Basic,1255.000000,2016-01-04,2016-05-24
1,Finjob,Rosalina Dieter,rosalina@piedpiper.com,belinda@finjob.com,Won,MG Special,45.000000,2016-01-04,2016-11-02
2,Kinnamplus,Donn Cantrell,donn@piedpiper.com,monte@kinnamplus.com,Lost,MG Special,720.801303,2016-01-06,2016-12-07
3,Genco Pura Olive Oil Company,James Ascencio,james@piedpiper.com,karole@gencopuraoliveoilcompany.com,Lost,MG Advanced,1054.000000,2016-01-06,2016-09-11
4,Stanredtax,Lajuana Vencill,lajuana@piedpiper.com,candice@stanredtax.com,Won,MG Advanced,3180.000000,2016-01-07,2016-01-31
...,...,...,...,...,...,...,...,...,...
7795,Vehement Capital Partners,Jonathan Berthelot,jonathan@piedpiper.com,lavonia@vehementcapitalpartners.com,Won,GTX Basic,590.000000,2018-12-30,2019-07-22
7796,Hottechi,Marty Freudenburg,marty@piedpiper.com,juliette@hottechi.com,Won,MG Advanced,3284.000000,2018-12-30,2019-05-26
7797,Umbrella Corporation,Anna Snelling,anna@piedpiper.com,orpha@umbrellacorporation.com,Lost,GTX Plus Pro,300.000000,2018-12-31,2019-04-15
7798,Rundofase,Gladys Colclough,gladys@piedpiper.com,crista@rundofase.com,Lost,MG Special,3284.000000,2018-12-31,2019-05-16


## Random Forest

In [67]:
one_hot_products = pd.get_dummies(df.Product)
df["Created Date"] = df["Created Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
df["Close Date"] = df["Close Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
df


  df["Created Date"] = df["Created Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))
  df["Close Date"] = df["Close Date"].apply(lambda x: (x - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's'))


Unnamed: 0,Customer,Agent,SalesAgentEmailID,ContactEmailID,Stage,Product,Close_Value,Created Date,Close Date
0,Konex,Corliss Cosme,corliss@piedpiper.com,delila@konex.com,Won,GTX Plus Basic,1255.000000,1.451866e+09,1.464048e+09
1,Finjob,Rosalina Dieter,rosalina@piedpiper.com,belinda@finjob.com,Won,MG Special,45.000000,1.451866e+09,1.478045e+09
2,Kinnamplus,Donn Cantrell,donn@piedpiper.com,monte@kinnamplus.com,Lost,MG Special,720.801303,1.452038e+09,1.481069e+09
3,Genco Pura Olive Oil Company,James Ascencio,james@piedpiper.com,karole@gencopuraoliveoilcompany.com,Lost,MG Advanced,1054.000000,1.452038e+09,1.473552e+09
4,Stanredtax,Lajuana Vencill,lajuana@piedpiper.com,candice@stanredtax.com,Won,MG Advanced,3180.000000,1.452125e+09,1.454198e+09
...,...,...,...,...,...,...,...,...,...
7795,Vehement Capital Partners,Jonathan Berthelot,jonathan@piedpiper.com,lavonia@vehementcapitalpartners.com,Won,GTX Basic,590.000000,1.546128e+09,1.563754e+09
7796,Hottechi,Marty Freudenburg,marty@piedpiper.com,juliette@hottechi.com,Won,MG Advanced,3284.000000,1.546128e+09,1.558829e+09
7797,Umbrella Corporation,Anna Snelling,anna@piedpiper.com,orpha@umbrellacorporation.com,Lost,GTX Plus Pro,300.000000,1.546214e+09,1.555286e+09
7798,Rundofase,Gladys Colclough,gladys@piedpiper.com,crista@rundofase.com,Lost,MG Special,3284.000000,1.546214e+09,1.557965e+09


In [68]:
df = df.join(one_hot_products)
df.drop("Product", axis=1, inplace=True)
df.drop(["Customer", "SalesAgentEmailID", "ContactEmailID", "Agent"], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df.drop("Stage", axis=1), df.Stage)



In [78]:
clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Lost       0.94      0.86      0.90       909
         Won       0.89      0.95      0.92      1041

    accuracy                           0.91      1950
   macro avg       0.91      0.90      0.91      1950
weighted avg       0.91      0.91      0.91      1950



## decision tree


In [70]:
dsc = DecisionTreeClassifier()
dsc.fit(X_train, y_train)

DecisionTreeClassifier()

In [71]:
y_pred = dsc.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        Lost       0.88      0.88      0.88       909
         Won       0.90      0.90      0.90      1041

    accuracy                           0.89      1950
   macro avg       0.89      0.89      0.89      1950
weighted avg       0.89      0.89      0.89      1950



## k Neighbors


In [72]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Lost       0.88      0.88      0.88       909
         Won       0.90      0.90      0.90      1041

    accuracy                           0.89      1950
   macro avg       0.89      0.89      0.89      1950
weighted avg       0.89      0.89      0.89      1950



## Naive bayes

In [73]:
nbc = MultinomialNB()
nbc.fit(X_train,y_train)
y_pred = nbc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Lost       0.46      0.39      0.42       909
         Won       0.53      0.61      0.57      1041

    accuracy                           0.51      1950
   macro avg       0.50      0.50      0.50      1950
weighted avg       0.50      0.51      0.50      1950



In [74]:
print(accuracy_score(y_test,y_pred))

0.5061538461538462


In [82]:
from sklearn.metrics import precision_recall_fscore_support

_ , _ , f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
f1_score

0.9061216054362493

In [83]:
confusion_matrix(y_test, y_pred)

array([[781, 128],
       [ 53, 988]])

In [84]:
import pickle

filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))