In [None]:
# Ideas to explore:
# - word cloud of the pitch text (boost specific words?)
# - statsmodels ordinary least squares (OLS) to inspect p-values



In [None]:
import pandas as pd
import numpy as np
from string import punctuation
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Load the source CSV (Shark Tank pitches and deals, per the file name).
# The path is relative, so it resolves against the kernel's working directory.
fname = "../data/S1_8Sharktankpitchesdeals.csv"
data = pd.read_csv(fname)
data

In [None]:
# Choose model output: the name of the `data` column used as the prediction target.
# Candidate column names noted by the author (uncomment-by-editing the assignment below):

# "Deal_Status"
# "Kevin O'Leary"
# "Barbara Corcoran"
# "Robert Herjavec"
# "Daymond John"
# "Kevin Harrington"
# "Mark Cuban"
# "Others"
shark_to_model = "Mark Cuban"

In [None]:
# Drop rows where the target column (shark_to_model) is NaN.
# reset_index(drop=True) renumbers the surviving rows 0..n-1. Without it the
# index keeps gaps, and later frames built from arrays (which get a fresh
# 0..n-1 index) no longer line up with `data` when columns are assigned by label.
data = data.dropna(subset=[shark_to_model]).reset_index(drop=True)
data

In [None]:
# Custom tokenizer from class activity 22-1-4
# This cell rebinds the name `stopwords` from the NLTK corpus module to a plain
# list (the tokenizer below reads that list). Re-import the corpus under an
# alias so the cell can be re-run; otherwise the second run would call
# `.words(...)` on a list and raise AttributeError.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words( 'english' ) + list(punctuation)
# Stemming
stemmer = PorterStemmer()
# Individual punctuation characters, used to spot leftover symbol tokens.
punc_list = list(punctuation)
def special_remove(word):
    """Tell whether ``word`` is a short token that contains punctuation.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bool
        True only when ``word`` has at most two characters and at least one
        of them is in ``punc_list``; longer tokens are always kept (False).
    """
    is_short = len(word) <= 2
    return is_short and any(ch in punc_list for ch in word)

# custom function that overrides default token generation
def custom_tokenizer(text):
    """Turn ``text`` into a list of stemmed tokens for the vectorizer.

    Steps: lower-case, split with ``nltk.word_tokenize``, drop tokens found in
    the module-level ``stopwords`` list (plus the literal ``"..."``), stem the
    survivors with ``stemmer``, then drop stems rejected by ``special_remove``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # Build the exclusion set once per call. The original concatenated a new
    # list for every token inside the comprehension; membership is unchanged.
    excluded = set(stopwords)
    excluded.add("...")
    tokens = nltk.word_tokenize(text.lower())
    stems = [stemmer.stem(tok) for tok in tokens if tok not in excluded]
    # further remove words with a special char
    return [stem for stem in stems if not special_remove(stem)]

# TF-IDF over unigrams and bigrams, using the custom stemming tokenizer.
# NOTE(review): stop_words='english' is applied to the already-stemmed tokens,
# so scikit-learn may warn that the stop list is inconsistent with the tokenizer.
activity_vectorizer = TfidfVectorizer(ngram_range=(1,2)
                            ,tokenizer=custom_tokenizer 
                            ,stop_words='english'
                            # ,min_df=2
                            ,strip_accents='unicode'
                            )

activity_vectors = activity_vectorizer.fit_transform(data["Pitched_Business_Desc"])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases that do not have it yet.
if hasattr(activity_vectorizer, "get_feature_names_out"):
    feature_names = activity_vectorizer.get_feature_names_out()
else:
    feature_names = activity_vectorizer.get_feature_names()
# Reuse data's index so each TF-IDF row carries the label of the row it came
# from; label-aligned column assignments from `data` then match correctly.
activity_df = pd.DataFrame(activity_vectors.toarray(), columns=feature_names, index=data.index)
activity_df

In [None]:
# Show the learned vocabulary. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() whenever it exists.
(activity_vectorizer.get_feature_names_out()
 if hasattr(activity_vectorizer, "get_feature_names_out")
 else activity_vectorizer.get_feature_names())

In [None]:
# Choose features

# To include Pitch, comment in next line, and comment out the following line
# To exclude Pitch, comment out next line, and comment in the following line
# .copy() so the extra columns are not also written into activity_df
# (plain assignment would only alias the same frame).
selected_features = activity_df.copy()
# selected_features = pd.DataFrame([])
# Assign by position (.values), not by index label: after dropna `data` may
# have gaps in its index while the TF-IDF frame is numbered 0..n-1, and a
# label-aligned assignment would shift values onto the wrong rows or insert NaN.
for column in ["Gender", "Category", "Amount_Asked_For", "Exchange_For_Stake", "Valuation"]:
    selected_features[column] = data[column].values
selected_features

In [None]:
# One-hot encode the non-numeric columns so every feature is numeric
# (presumably Gender and Category -- confirm against the CSV dtypes).
selected_features = pd.get_dummies(selected_features)
selected_features

In [None]:
# Inspect the final feature names that will be fed to the models.
selected_features.columns

In [None]:
# Baseline classifier, plus the feature matrix and target vector shared by all models below.
model = MultinomialNB()
X = selected_features
# Flatten the single-column target frame into a 1-D label array.
y = np.ravel(data[[shark_to_model]].values)
# Sanity check: the row counts of X and y must match.
print(X.shape, y.shape, sep="\n")

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2, f_classif
# #apply SelectKBest class to keep the k best features (k=6 below)
# bestfeatures = SelectKBest(score_func=f_classif, k=6)
# fit = bestfeatures.fit(X,y)
# dfscores = pd.DataFrame(fit.scores_)
# dfpvalues = pd.DataFrame(fit.pvalues_)
# dfcolumns = pd.DataFrame(X.columns)
# #concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores,dfpvalues],axis=1)
# featureScores.columns = ['Specs','Score','PValue']  #naming the dataframe columns
# print(featureScores.nlargest(20,'Score'))  #print the 20 highest-scoring features

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2, f_classif
# #apply SelectKBest class to keep the k best features (k=6 below)
# bestfeatures = SelectKBest(score_func=chi2, k=6)
# fit = bestfeatures.fit(X,y)
# dfscores = pd.DataFrame(fit.scores_)
# dfpvalues = pd.DataFrame(fit.pvalues_)
# dfcolumns = pd.DataFrame(X.columns)
# #concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores,dfpvalues],axis=1)
# featureScores.columns = ['Specs','Score','PValue']  #naming the dataframe columns
# print(featureScores.nlargest(20,'Score'))  #print the 20 highest-scoring features

In [None]:
# Hold out a test split using scikit-learn's default test fraction; the fixed
# random_state makes the split repeatable.
# NOTE(review): no stratify=y, so class balance may differ between the splits -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit the baseline Naive Bayes model on the training split.
# y was flattened when it was built, so ravel() here should be a no-op.
model.fit(X_train, y_train.ravel())

In [None]:
# Predict labels for the held-out test rows with the fitted baseline model.
y_pred = model.predict(X_test)

In [None]:
# Side-by-side table of true vs. predicted labels for the test split.
pd.DataFrame({"actual": y_test.reshape(-1), "prediction": y_pred.reshape(-1)})

In [None]:
# Baseline accuracy: fraction of test rows whose prediction matches the true label.
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
# Import ML models from sklearn
from sklearn.linear_model import LogisticRegression # Regression classifier
from sklearn.tree import DecisionTreeClassifier # Decision Tree classifier
from sklearn import svm # Support Vector Machine
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent Classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Random Forest and Gradient Boosting Classifier
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier 
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix # Some metrics to check the performance of the models

In [None]:
# Setting parameters for each algorithm - these are tunable to achieve max accuracy
# Every estimator that takes a seed gets a fixed random_state so results
# repeat across runs. SGD previously had none, so its scores could change
# between the comparison loop and the later refit of the chosen model.

Classifiers = {'LR':LogisticRegression(random_state=10,C=5,max_iter=300, solver='lbfgs')
               ,'DTC':DecisionTreeClassifier(random_state=10,min_samples_leaf=2)
               ,'RF':RandomForestClassifier(random_state=10,n_estimators=100,n_jobs=-1)
               ,'GBC':GradientBoostingClassifier(random_state=10,n_estimators=400,learning_rate=0.2)
               ,'SGD':SGDClassifier(loss="hinge", penalty="l2", random_state=10)
            #    ,'SVM':svm.SVC(kernel='linear', C=0.1)
               ,'NB':MultinomialNB(alpha=.05)
               }

In [None]:
# Short names of the candidate models (the keys accepted by ML_Pipeline).
Classifiers.keys()

In [None]:
# Create a pipeline so you can reuse the code

def ML_Pipeline(clf_name):
    """Fit the named classifier, print its test-set metrics, and return it.

    The estimator is looked up in the notebook-level ``Classifiers`` dict,
    fitted in place on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test`` (all notebook globals).

    Parameters
    ----------
    clf_name : str
        Key into ``Classifiers``.

    Returns
    -------
    The fitted estimator (the same object that is stored in ``Classifiers``).

    Raises
    ------
    KeyError
        If ``clf_name`` is not a key of ``Classifiers``.
    """
    estimator = Classifiers[clf_name]
    estimator.fit(X_train, y_train.ravel())
    predictions = estimator.predict(X_test)

    # Compute everything first, then print, in the same order as before.
    report = (
        ('Classifier', clf_name),
        ('Accuracy', accuracy_score(y_test, predictions)),
        ('Precision', metrics.precision_score(y_test, predictions)),
        ('Recall', recall_score(y_test, predictions)),
    )
    matrix = confusion_matrix(y_test, predictions)

    for label, value in report:
        print(label, '=', value)
    print(matrix)
    print('===' * 20)
    return estimator

In [None]:
# Fit every candidate on the training split, score it on the test split, and
# collect one metrics record per classifier (clf_dict is a list of dicts).
clf_dict = []
for clf_name, clf in Classifiers.items():
    clf.fit(X_train, y_train.ravel())
    y_pred = clf.predict(X_test)
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': metrics.precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }
    Confusion_matrix = confusion_matrix(y_test, y_pred)
    clf_dict.append({'Classifier': clf_name, **scores})
    print('Classifier =', clf_name)
    for metric_name, metric_value in scores.items():
        print(metric_name, '=', metric_value)
    print(Confusion_matrix)
    print('===' * 20)


In [None]:
# Tabulate the collected metrics: one row per classifier.
clf_df = pd.DataFrame(clf_dict)
clf_df

In [None]:
# Highest test accuracy reached by any of the classifiers.
clf_df['Accuracy'].max()

In [None]:
# Short name of the classifier with the highest accuracy
# (idxmax picks the first such row when several tie).
best_clf = clf_df.loc[clf_df['Accuracy'].idxmax(),'Classifier']
best_clf

In [None]:
# Re-run the pipeline for the winner: this refits that estimator on the
# training split, prints its metrics again, and returns the fitted object.
best_model = ML_Pipeline(best_clf)
best_model


In [None]:
# Persist the chosen model. `sklearn.externals.joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23, so import the standalone joblib
# package and fall back to the bundled copy only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(best_model, f"{shark_to_model}_model.pkl")

In [None]:
import pickle
# Save the feature column names next to the model.
# The context manager closes (and flushes) the file even if pickling fails;
# the original left the handle from open() unclosed.
# NOTE(review): only the column names are saved here, not the fitted
# TfidfVectorizer -- confirm the consumer rebuilds features consistently.
with open(f"{shark_to_model}_vocab.pkl", "wb") as vocab_file:
    pickle.dump(selected_features.columns, vocab_file)