# code runs!<br>
do final clean and add ROC

In [1]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from tqdm import tqdm

import json
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn import tree

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive') 
# path = '/content/gdrive/MyDrive/IFT 6390/kaggle2/'

In [3]:
tqdm.pandas()

### read and prep data

In [4]:
# input_path = path+'data/input/'
input_path = 'data/input/'

In [5]:
all_train_data = pd.read_csv(input_path + 'train_data_prepped.csv').fillna('')
all_test_data = pd.read_csv(input_path + 'test_data_prepped.csv').fillna('')

In [6]:
train_target = pd.read_csv(input_path + 'train_results.csv')
train_target['target'].value_counts()

positive    520436
negative    519803
neutral         84
Name: target, dtype: int64

In [7]:
train_target['target'] = [0 if t == 'negative' else 2 if t == 'positive' else 2 for t in train_target['target'].values]

In [8]:
all_train_data['target'] = train_target['target']

In [9]:
all_train_data['target'].value_counts()

2    520520
0    519803
Name: target, dtype: int64

### establish parameters

In [10]:
vocab_sizes = [100, 1_000, 10_000]
ngrams = [(1, 1), (2, 2), (1, 2), (1, 3)]
text_columns = ['text_no_numerals', 'text_no_sw', 'text_porter_stemmed', 'text_lancaster_stemmed', 'text_lemmatized']
vectorizers = [TfidfVectorizer, CountVectorizer]
models = [DecisionTreeClassifier()]


set decision tree parameters

In [11]:
param_grid = {
    'criterion': ['gini',],
    'max_depth': [3,5,7],
    'max_features': ['sqrt', 'log2', None],
    
}

In [12]:
def trainModels(train_data, targets, vocab_sizes, text_columns, ngrams, vectorizers, models, param_grid):
    param_tracker = []
    predictions_tracker = []
    counter = 1
    for size in tqdm(vocab_sizes):


        for col in text_columns:

            text = train_data[col]
            for ng in ngrams:

                for vectorizer in vectorizers:
                    vectorizer_name = str(vectorizer).split('.')[-1].replace('>', '').replace("\'",'').strip()
                    vectorizer = vectorizer(max_features=size, ngram_range=ng)
                    BOW = vectorizer.fit_transform(text)
                    BOW_array = BOW.toarray()
                    print('BOW shape: ', BOW_array.shape)

                    X_train, X_valid, y_train, y_valid = train_test_split(BOW_array, targets, test_size=0.2, random_state=42)


                    for model in models:
                        
                        model_name = str(model)
                        counter+=1
                        if counter % 10==0:
                            print(f'{counter}')

                        grid = RandomizedSearchCV(estimator = model,
                                         param_distributions=param_grid,
                                         scoring = 'accuracy',
                                         cv=3,
                                         n_iter=5,
                                         verbose=0)

                        grid.fit(X_train, y_train)

                        accuracy = grid.score(X_valid, y_valid)

                        grid_params = grid.best_params_
                        criterion = grid_params['criterion']
                        max_depth = grid_params['max_depth']
                        max_features = grid_params['max_features']
                        print('accuracy: ', accuracy, '\n')

                        model_params = {

                            'vocab_size': size,
                            'model_name': model_name,
                            'text_column': col,
                            'ngram': ng,
                            'vectorizer': vectorizer_name,
                            'splitting_criterion': criterion,
                            'tree_max_depth': max_depth,
                            'tree_max_features': max_features,
                            'accuracy': accuracy


                        }


                        param_tracker.append(model_params)



            

    return param_tracker



In [13]:
sample_size=5000
v1_data = all_train_data.sample(sample_size)
v1_train_X = v1_data.iloc[:, :-1]
v1_train_y  = v1_data.iloc[:, -1]

In [14]:
param_tracker = trainModels(v1_train_X, v1_train_y, vocab_sizes, text_columns, ngrams, vectorizers, models, param_grid)

  0%|                                                                              | 0/3 [00:00<?, ?it/s]

BOW shape:  (5000, 100)
accuracy:  0.613 

BOW shape:  (5000, 100)
accuracy:  0.609 

BOW shape:  (5000, 100)
accuracy:  0.539 

BOW shape:  (5000, 100)
accuracy:  0.554 

BOW shape:  (5000, 100)
accuracy:  0.61 

BOW shape:  (5000, 100)
accuracy:  0.553 

BOW shape:  (5000, 100)
accuracy:  0.574 

BOW shape:  (5000, 100)
accuracy:  0.576 

BOW shape:  (5000, 100)
10
accuracy:  0.58 

BOW shape:  (5000, 100)
accuracy:  0.574 

BOW shape:  (5000, 100)
accuracy:  0.534 

BOW shape:  (5000, 100)
accuracy:  0.541 

BOW shape:  (5000, 100)
accuracy:  0.581 

BOW shape:  (5000, 100)
accuracy:  0.523 

BOW shape:  (5000, 100)
accuracy:  0.58 

BOW shape:  (5000, 100)
accuracy:  0.575 

BOW shape:  (5000, 100)
accuracy:  0.61 

BOW shape:  (5000, 100)
accuracy:  0.604 

BOW shape:  (5000, 100)
20
accuracy:  0.549 

BOW shape:  (5000, 100)
accuracy:  0.551 

BOW shape:  (5000, 100)
accuracy:  0.599 

BOW shape:  (5000, 100)
accuracy:  0.596 

BOW shape:  (5000, 100)
accuracy:  0.598 

BOW shape

 33%|███████████████████████▎                                              | 1/3 [00:12<00:25, 12.63s/it]

BOW shape:  (5000, 100)
accuracy:  0.612 

BOW shape:  (5000, 1000)
accuracy:  0.57 

BOW shape:  (5000, 1000)
accuracy:  0.603 

BOW shape:  (5000, 1000)
accuracy:  0.548 

BOW shape:  (5000, 1000)
accuracy:  0.555 

BOW shape:  (5000, 1000)
accuracy:  0.618 

BOW shape:  (5000, 1000)
accuracy:  0.605 

BOW shape:  (5000, 1000)
accuracy:  0.58 

BOW shape:  (5000, 1000)
accuracy:  0.605 

BOW shape:  (5000, 1000)
50
accuracy:  0.498 

BOW shape:  (5000, 1000)
accuracy:  0.578 

BOW shape:  (5000, 1000)
accuracy:  0.541 

BOW shape:  (5000, 1000)
accuracy:  0.543 

BOW shape:  (5000, 1000)
accuracy:  0.517 

BOW shape:  (5000, 1000)
accuracy:  0.578 

BOW shape:  (5000, 1000)
accuracy:  0.558 

BOW shape:  (5000, 1000)
accuracy:  0.579 

BOW shape:  (5000, 1000)
accuracy:  0.611 

BOW shape:  (5000, 1000)
accuracy:  0.624 

BOW shape:  (5000, 1000)
60
accuracy:  0.551 

BOW shape:  (5000, 1000)
accuracy:  0.551 

BOW shape:  (5000, 1000)
accuracy:  0.596 

BOW shape:  (5000, 1000)
accu

 67%|██████████████████████████████████████████████▋                       | 2/3 [01:08<00:37, 37.84s/it]

accuracy:  0.605 

BOW shape:  (5000, 9211)
accuracy:  0.567 

BOW shape:  (5000, 9211)
accuracy:  0.609 

BOW shape:  (5000, 10000)
accuracy:  0.551 

BOW shape:  (5000, 10000)
accuracy:  0.556 

BOW shape:  (5000, 10000)
accuracy:  0.565 

BOW shape:  (5000, 10000)
accuracy:  0.603 

BOW shape:  (5000, 10000)
accuracy:  0.602 

BOW shape:  (5000, 10000)
accuracy:  0.607 

BOW shape:  (5000, 9122)
90
accuracy:  0.585 

BOW shape:  (5000, 9122)
accuracy:  0.516 

BOW shape:  (5000, 10000)
accuracy:  0.543 

BOW shape:  (5000, 10000)
accuracy:  0.543 

BOW shape:  (5000, 10000)
accuracy:  0.586 

BOW shape:  (5000, 10000)
accuracy:  0.559 

BOW shape:  (5000, 10000)
accuracy:  0.583 

BOW shape:  (5000, 10000)
accuracy:  0.515 

BOW shape:  (5000, 7922)
accuracy:  0.54 

BOW shape:  (5000, 7922)
accuracy:  0.589 

BOW shape:  (5000, 10000)
100
accuracy:  0.547 

BOW shape:  (5000, 10000)
accuracy:  0.551 

BOW shape:  (5000, 10000)
accuracy:  0.596 

BOW shape:  (5000, 10000)
accuracy: 

100%|█████████████████████████████████████████████████████████████████████| 3/3 [15:31<00:00, 310.48s/it]

accuracy:  0.61 






In [15]:
param_df = pd.DataFrame(param_tracker,)
param_df = param_df.sort_values(by='accuracy', ascending=False).reset_index(drop=True)


In [16]:
param_df

Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy
0,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.629
1,1000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628
2,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628
3,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",CountVectorizer,gini,7,,0.624
4,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.624
...,...,...,...,...,...,...,...,...,...
115,1000,DecisionTreeClassifier(),text_no_sw,"(1, 1)",TfidfVectorizer,gini,3,,0.498
116,1000,DecisionTreeClassifier(),text_lemmatized,"(2, 2)",TfidfVectorizer,gini,5,,0.486
117,100,DecisionTreeClassifier(),text_lancaster_stemmed,"(2, 2)",TfidfVectorizer,gini,5,sqrt,0.486
118,100,DecisionTreeClassifier(),text_lemmatized,"(2, 2)",TfidfVectorizer,gini,5,,0.485


saving and loading results

In [17]:
param_df.to_csv('data/output/decision_tree/decision_tree_params.csv', index=False)


In [18]:
param_df = pd.read_csv('data/output/decision_tree/decision_tree_params.csv')
param_df['ngram'] = [(int(ng[1]), int(ng[4])) for ng in param_df['ngram'].values]
param_df = param_df.replace({np.nan: None})
param_df.head(3)


Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy
0,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.629
1,1000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628
2,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628


get best 20 combinations and train on more data

In [19]:
top_10_params = param_df[:10]

# for cross validation, num folds   
k = 3

In [20]:
sample_size=50_000
v2_data = all_train_data.sample(sample_size).reset_index(drop=True)
v2_train_X = v2_data.iloc[:, :-1]
v2_train_y  = v2_data.iloc[:, -1]

In [25]:

model_accuracy = []
predictions = []
for i, param_set in enumerate(tqdm(top_10_params.values)):
    (vocab_size, 
     model_type, 
     text_column, 
     ngram, 
     vectorizer, 
     criterion, 
     max_depth,
     max_features, 
     *_) = param_set
    

    
    print(vocab_size, model_type, text_column, ngram, vectorizer, criterion, max_depth, max_features)
    print('\n')
    
    if vocab_size:
        vocab_size = int(vocab_size)
        
        
    if vectorizer == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer
    elif vectorizer == 'CountVectorizer':
        vectorizer = CountVectorizer

    else:
        print('warning, unrecognized vectorizer!')
        print(i, param_set)
    
    print('vectorizing')
    model_vectorizer = vectorizer(max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    
    BOW = model_vectorizer.fit_transform(v2_train_X[text_column])
    
    
    kfolds = KFold(n_splits=k)
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, max_features=max_features)
    
    acc_score = []
    for train_index , valid_index in kfolds.split(BOW):
        X_train , X_test = BOW[train_index,:],BOW[valid_index,:]
        y_train , y_test = v2_train_y[train_index] , v2_train_y[valid_index]
        
        print('fitting fold')
        model.fit(X_train,y_train)
        pred_values = model.predict(X_test)

        acc = accuracy_score(pred_values , y_test)
        print('fold acc: ', acc)
        acc_score.append(acc)
        
        pred_probs = model.predict_proba(X_test)

    mean_acc = sum(acc_score)/len(acc_score)
    
    print(mean_acc, '\n\n')
    model_accuracy.append(mean_acc)
    predictions.append(pred_probs)
    


  0%|                                                                             | 0/10 [00:00<?, ?it/s]

10000 DecisionTreeClassifier() text_lancaster_stemmed (1, 3) TfidfVectorizer gini 7 None


vectorizing
creating BOW
fitting fold
fold acc:  0.5839683206335873
fitting fold
fold acc:  0.5859482810343793
fitting fold


 10%|██████▉                                                              | 1/10 [00:07<01:10,  7.85s/it]

fold acc:  0.5853834153366134
0.5851000056681933 


1000 DecisionTreeClassifier() text_lancaster_stemmed (1, 3) TfidfVectorizer gini 7 None


vectorizing
creating BOW


 10%|██████▉                                                              | 1/10 [00:12<01:54, 12.74s/it]


KeyboardInterrupt: 

In [22]:
model_accuracy

[0.580199812060401,
 0.5827399016642252,
 0.5800798168604011,
 0.5798598056600012,
 0.579879822060345,
 0.5852400580693534,
 0.5852800068683615,
 0.5812599612642332,
 0.5840400472681774,
 0.5804600768659053]

In [23]:
top_10_params['acc2'] = model_accuracy
top_10_params = top_10_params.sort_values(by='acc2', ascending=False).reset_index(drop=True)
top_10_params

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy,acc2
0,10000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 3)",TfidfVectorizer,gini,7,,0.62,0.58528
1,1000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 3)",TfidfVectorizer,gini,7,,0.623,0.58524
2,1000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 2)",TfidfVectorizer,gini,7,,0.619,0.58404
3,1000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628,0.58274
4,1000,DecisionTreeClassifier(),text_lemmatized,"(1, 1)",TfidfVectorizer,gini,7,,0.62,0.58126
5,1000,DecisionTreeClassifier(),text_no_numerals,"(1, 2)",TfidfVectorizer,gini,7,,0.618,0.58046
6,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.629,0.5802
7,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,7,,0.628,0.58008
8,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.624,0.57988
9,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",CountVectorizer,gini,7,,0.624,0.57986


save and load results

In [24]:
top_10_params.to_csv('data/output/decision_tree/decision_tree_top_10_params.csv', index=False)

In [28]:
top_10_params = pd.read_csv('data/output/decision_tree/decision_tree_top_10_params.csv')
top_10_params['ngram'] = [(int(ng[1]), int(ng[4])) for ng in top_10_params['ngram'].values]
top_10_params = top_10_params.replace({np.nan: None})
top_10_params.head(3)

Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy,acc2
0,1000,DecisionTreeClassifier(),text_lemmatized,"(1, 2)",CountVectorizer,gini,7,,0.617,0.59054
1,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",TfidfVectorizer,gini,7,,0.616,0.59012
2,100,DecisionTreeClassifier(),text_porter_stemmed,"(1, 3)",CountVectorizer,gini,7,,0.611,0.58976


repeat tuning one last time with top 3 parameter combos and more data

In [None]:
top_3_params = top_10_params[:3]

In [None]:
sample_size=200_000
k=3
v3_data = all_train_data.sample(sample_size).reset_index(drop=True)
v3_train_X = v3_data.iloc[:, :-1]
v3_train_y  = v3_data.iloc[:, -1]

In [None]:

model_accuracy = []
top_3_predictions = []
for i, param_set in enumerate(tqdm(top_3_params.values)):
    (vocab_size, 
     model_type, 
     text_column, 
     ngram, 
     vectorizer, 
     criterion, 
     max_depth,
     max_features, 
     *_) = param_set
    

    
    print(vocab_size, model_type, text_column, ngram, vectorizer, criterion, max_depth, max_features)
    
    if vocab_size:
        vocab_size = int(vocab_size)
        
        
    if vectorizer == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer
    elif vectorizer == 'CountVectorizer':
        vectorizer = CountVectorizer

    else:
        print('warning, unrecognized vectorizer!')
        print(i, param_set)
    
    print('vectorizing')
    model_vectorizer = vectorizer(max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    
    BOW = model_vectorizer.fit_transform(v3_train_X[text_column])
    BOW_array = BOW
    print('splitting train-validation data')
    


    

    kfolds = KFold(n_splits=k)
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, max_features=max_features)
    
    acc_score = []
    for train_index , valid_index in kfolds.split(BOW_array):
        X_train , X_test = BOW_array[train_index,:],BOW_array[valid_index,:]
        y_train , y_test = v3_train_y[train_index] , v3_train_y[valid_index]
        
        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        acc = accuracy_score(pred_values , y_test)
        acc_score.append(acc)
        
        pred_probs = model.predict_proba(X_test)

    mean_acc = sum(acc_score)/len(acc_score)
    # mean_probs = 
    
    print(mean_acc, '\n\n')
    model_accuracy.append(mean_acc)
    top_3_predictions.append(pred_probs)
    


In [None]:
top_3_params['acc3'] = model_accuracy
top_3_params = top_3_params.sort_values(by='acc3', ascending=False).reset_index(drop=True)
top_3_params

In [None]:
top_3_params.to_csv(path+'data/output/decision_tree/decision_tree_top_3_params.csv', index=False)

### train final model on best params

In [None]:

vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 1))


BOW = vectorizer.fit_transform(all_train_data['text_lancaster_stemmed'])




the validation data will be used to generate the ROC curve

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(BOW, all_train_data['target'], test_size=0.2, random_state=42)


In [None]:
model = DecisionTreeClassifier(criterion='gini', max_depth=5, max_features=None)

In [None]:
model.fit(X_train, y_train)

transform test data

In [None]:
test_BOW = vectorizer.transform(all_test_data['text_lancaster_stemmed'])


In [None]:
prediction_values = model.predict(test_BOW)


In [None]:
prediction_probs = model.predict_proba(test_BOW)

In [None]:
pos_predictions = [pred[-1] for pred in prediction_probs]

In [None]:
thresh = 0.5

In [None]:
bin_pred = []
for pred in pos_predictions:
    if pred >= thresh:
        binary_pred = 2
    else:
        binary_pred = 0
    bin_pred.append(binary_pred)

In [None]:
pdf = pd.DataFrame(bin_pred).reset_index()
pdf.columns = ['id', 'target']

In [None]:
pdf['target'].value_counts()

In [None]:
pdf.to_csv('data/output/decision_tree/decision_tree_predictions.csv', index=False)

### generate ROC curve

In [None]:
valid_predictions = model.predict_proba(X_valid)

In [None]:
model.classes_

In [None]:
valid_predictions = [p[1] for p in valid_predictions]

In [None]:
fpr, tpr, thresholds = roc_curve(y_valid, valid_predictions, pos_label=2)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.savefig('data/output/decision_tree/ROC.png')
plt.show()

## visualizing model decisions

In [None]:
text_representation = tree.export_text(model)

In [None]:
print(text_representation)

In [None]:
g = tree.export_graphviz(model)

In [None]:
print(g)

In [None]:
type(g)

In [None]:
import json

In [None]:
tree_plot = tree.plot_tree(model)

In [None]:
json

In [None]:
acc = accuracy_score(model.predict(X_valid) , y_valid)
acc

In [None]:
y_valid

In [None]:
print(text_representation)

In [None]:
text_representation[:3]