In [1]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from tqdm import tqdm

import json
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score

In [2]:
# Register tqdm's pandas integration (progress_* variants of apply-style methods).
# NOTE(review): no cell visible in this notebook calls a progress_* method — confirm it is still needed.
tqdm.pandas()

In [3]:
# Relative directory prefix (note the trailing slash) that is string-concatenated with
# the CSV file names read in the following cells.
input_path = 'data/input/'

In [4]:
# Load the prepared train and test frames; missing cells become empty strings so
# every text column can be handed to a vectorizer without NaN values.
all_train_data, all_test_data = (
    pd.read_csv(input_path + file_name).fillna('')
    for file_name in ('train_data_prepped.csv', 'test_data_prepped.csv')
)

In [5]:
# Load the training labels and display how many rows carry each label value.
train_target = pd.read_csv(input_path + 'train_results.csv')
train_target['target'].value_counts()

positive    520436
negative    519803
neutral         84
Name: target, dtype: int64

In [6]:
# Encode labels as integers: 'negative' -> 0, every other label -> 2.
# NOTE(review): this folds 'neutral' into the same class as 'positive' — confirm that is intended.
# The cell is not idempotent: re-running it on already-encoded values maps everything to 2.
train_target['target'] = [0 if label == 'negative' else 2 for label in train_target['target'].values]

In [7]:
# Attach the encoded labels to the feature frame. pandas aligns this assignment on the
# row index, so it assumes both frames share the same index — TODO confirm.
all_train_data['target'] = train_target['target']

In [8]:
# Sanity check: class balance of the encoded labels after attaching them.
all_train_data['target'].value_counts()

2    520520
0    519803
Name: target, dtype: int64

Split the features from the labels, then set the text-processing parameters


In [9]:
# Positional split: every column except the last is treated as a feature and the last
# column as the label. This relies on 'target' being the final column of all_train_data.
train_X = all_train_data.iloc[:, :-1]
train_y  = all_train_data.iloc[:, -1]

In [10]:
# Preview the first two feature rows (raw text plus each pre-processed text variant).
train_X.head(2)

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized
0,0,anyway im getting of for a while,train,anyway im getting of for a while,anyway im getting of for a while,anyway im getting,anyway im get,anyway im get,anyway im getting
1,1,"my red, apache isn't feelin too well this morn...",train,my red apache isnt feelin too well this mornin...,my red apache isnt feelin too well this mornin...,red apache feelin well morning httpmypictmen,red apach feelin well morn httpmypictmen,red apach feelin wel morn httpmypictmen,red apache feelin well morning httpmypictmen


In [11]:
# Preview the first two encoded labels.
train_y.head(2)

0    2
1    0
Name: target, dtype: int64

In [12]:
# Search space iterated by trainModels (every combination of the five lists is evaluated).
# Vocabulary sizes, passed to the vectorizer as max_features.
vocab_sizes = [100, 1_000, 10_000]
# (min_n, max_n) n-gram ranges, passed to the vectorizer as ngram_range.
ngrams = [(1, 1), (2, 2), (1, 2), (1, 3)]
# Names of the pre-processed text columns in the training frame to vectorize.
text_columns = ['text_no_numerals', 'text_no_sw', 'text_porter_stemmed', 'text_lancaster_stemmed', 'text_lemmatized']
# Vectorizer classes (not instances); trainModels instantiates them per combination.
vectorizers = [TfidfVectorizer, CountVectorizer]
# Estimator instances handed to RandomizedSearchCV as the base estimator.
models = [DecisionTreeClassifier()]


Set the decision tree hyper-parameter search space

In [13]:
# Decision-tree hyper-parameter candidates sampled by RandomizedSearchCV in trainModels.
param_grid = {
    'criterion': ['gini'],
    'max_depth': [5, 10, 30],
    'max_features': ['sqrt', 'log2'],
}

In [14]:
def trainModels(df, targets, vocab_sizes, text_columns, ngrams, vectorizers, models, param_grid):
    """Run a randomized hyper-parameter search for every text-representation/model combination.

    For each combination of vocabulary size, text column, n-gram range and
    vectorizer class, the chosen column of ``df`` is vectorized and split 80/20
    into train/validation parts (``random_state=42``). Each model is then tuned
    on the training part with a 3-fold ``RandomizedSearchCV`` (5 sampled
    candidates, accuracy scoring) and the refit best estimator is scored on the
    validation part.

    Parameters
    ----------
    df : DataFrame
        Frame containing the text columns named in ``text_columns``.
    targets : array-like
        Labels aligned row-for-row with ``df``.
    vocab_sizes : iterable of int
        Values passed to the vectorizer as ``max_features``.
    text_columns : iterable of str
        Column names of ``df`` to vectorize.
    ngrams : iterable of tuple
        ``(min_n, max_n)`` pairs passed to the vectorizer as ``ngram_range``.
    vectorizers : iterable of classes
        Vectorizer classes (not instances) accepting ``max_features`` and
        ``ngram_range`` keyword arguments.
    models : iterable of estimators
        Estimator instances used as the base estimator of the search.
    param_grid : dict
        Passed to ``RandomizedSearchCV`` as ``param_distributions``.

    Returns
    -------
    list of dict
        One record per combination with the keys ``vocab_size``, ``model_name``,
        ``text_column``, ``ngram``, ``vectorizer``, ``splitting_criterion``,
        ``tree_max_depth``, ``tree_max_features`` and ``accuracy``.
    """
    param_tracker = []
    counter = 0
    for size in tqdm(vocab_sizes):
        for col in text_columns:
            for ng in ngrams:
                # Use a distinct name for the class so the loop variable is never
                # rebound to an instance (which broke runs with more than one model).
                for vectorizer_cls in vectorizers:
                    vectorizer_name = getattr(vectorizer_cls, '__name__', str(vectorizer_cls))

                    # The bag-of-words depends only on (size, col, ng, vectorizer),
                    # so it is built once and shared by every model.
                    # NOTE(review): the vectorizer is fit on all rows before the
                    # train/validation split, so the vocabulary also sees validation text.
                    fitted_vectorizer = vectorizer_cls(max_features=size, ngram_range=ng)
                    # Keep the matrix sparse: train_test_split and the search accept
                    # sparse input, and a dense copy needs rows * vocab_size cells.
                    BOW = fitted_vectorizer.fit_transform(df[col])

                    X_train, X_valid, y_train, y_valid = train_test_split(
                        BOW, targets, test_size=0.2, random_state=42
                    )

                    for model in models:
                        model_name = str(model)

                        counter += 1
                        if counter % 10 == 0:
                            print(f'{counter}')

                        grid = RandomizedSearchCV(estimator=model,
                                                  param_distributions=param_grid,
                                                  scoring='accuracy',
                                                  cv=3,
                                                  n_iter=5,
                                                  random_state=42,  # reproducible candidate sampling
                                                  verbose=0)
                        grid.fit(X_train, y_train)

                        # Accuracy of the refit best estimator on the held-out 20%.
                        accuracy = grid.score(X_valid, y_valid)

                        # .get() keeps the record complete even when param_grid
                        # does not tune one of these three settings.
                        grid_params = grid.best_params_

                        param_tracker.append({
                            'vocab_size': size,
                            'model_name': model_name,
                            'text_column': col,
                            'ngram': ng,
                            'vectorizer': vectorizer_name,
                            'splitting_criterion': grid_params.get('criterion'),
                            'tree_max_depth': grid_params.get('max_depth'),
                            'tree_max_features': grid_params.get('max_features'),
                            'accuracy': accuracy,
                        })

    return param_tracker



In [15]:
# Run the full combination search on only the first 5000 rows of the training data.
param_tracker = trainModels(train_X[:5000], train_y[:5000], vocab_sizes, text_columns, ngrams, vectorizers, models, param_grid)

  0%|                                                                                                                                                                    | 0/3 [00:00<?, ?it/s]

10
20
30
40


 33%|████████████████████████████████████████████████████                                                                                                        | 1/3 [00:10<00:21, 10.75s/it]

50
60
70
80


 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                    | 2/3 [01:53<01:04, 64.71s/it]

90
100
110
120


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [34:26<00:00, 688.91s/it]


In [16]:
# Tabulate the search records, best validation accuracy first, with a fresh 0..n-1 index.
param_df = (
    pd.DataFrame(param_tracker)
    .sort_values(by='accuracy', ascending=False)
    .reset_index(drop=True)
)
# param_df

Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy
0,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
1,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
2,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",CountVectorizer,gini,30,,0.640
3,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",TfidfVectorizer,gini,30,,0.638
4,10000,DecisionTreeClassifier(),text_lancaster_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.638
...,...,...,...,...,...,...,...,...,...
115,10000,DecisionTreeClassifier(),text_lancaster_stemmed,"(2, 2)",CountVectorizer,gini,30,sqrt,0.529
116,10000,DecisionTreeClassifier(),text_no_sw,"(2, 2)",TfidfVectorizer,gini,10,,0.527
117,1000,DecisionTreeClassifier(),text_lemmatized,"(2, 2)",CountVectorizer,gini,30,sqrt,0.525
118,1000,DecisionTreeClassifier(),text_no_sw,"(1, 2)",CountVectorizer,gini,30,log2,0.524


In [17]:
# param_df.iloc[0]['vectorizer'].split('.')[-1].replace('>', '').replace("\'",'').strip()

In [15]:
# param_df.to_csv('decision_tree_params.csv', index=False)


Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy
0,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
1,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
2,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",CountVectorizer,gini,30,,0.64


In [24]:
# Reload the saved search results. The file must already exist: the only visible cell
# that writes 'decision_tree_params.csv' is commented out.
param_df = pd.read_csv('decision_tree_params.csv')
# The CSV stores each n-gram range as text such as "(1, 2)". Parse it by splitting on the
# comma rather than by character position, so multi-digit bounds and different spacing
# are handled too.
param_df['ngram'] = [
    tuple(int(bound) for bound in ng.strip('()').split(','))
    for ng in param_df['ngram'].values
]
# Empty CSV cells are read back as NaN; turn them into None so they can be passed to
# DecisionTreeClassifier (e.g. max_features=None).
param_df = param_df.replace({np.nan: None})
param_df.head(3)

Unnamed: 0,vocab_size,model_name,text_column,ngram,vectorizer,splitting_criterion,tree_max_depth,tree_max_features,accuracy
0,1000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
1,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 2)",CountVectorizer,gini,30,,0.644
2,10000,DecisionTreeClassifier(),text_porter_stemmed,"(1, 1)",CountVectorizer,gini,30,,0.64


In [25]:
# Keep the 20 best-ranked parameter sets (param_df is ordered by accuracy, best first).
top_20_params = param_df.head(20)

# Number of folds for the cross-validation runs below.
k = 3

In [None]:

# Re-score each of the top-20 parameter sets with k-fold cross-validation over the whole
# training frame, fitting each fold's tree on a capped number of rows.
model_accuracy = []
predictions = []  # one entry per parameter set: list of per-fold predict_proba arrays

# Map the vectorizer name stored in the results table back to its class.
vectorizer_classes = {
    'TfidfVectorizer': TfidfVectorizer,
    'CountVectorizer': CountVectorizer,
}
max_train_rows = 20_000  # cap on the rows used to fit each fold's tree

for i, param_set in enumerate(tqdm(top_20_params.values)):
    # Positional unpacking: relies on the column order of the results table;
    # trailing columns (accuracy, ...) are ignored.
    (vocab_size,
     model_type,
     text_column,
     ngram,
     vectorizer,
     criterion,
     max_depth,
     max_features,
     *_) = param_set

    print(vocab_size, model_type, text_column, ngram, vectorizer, criterion, max_depth, max_features)

    if vocab_size:
        vocab_size = int(vocab_size)

    # Fail loudly instead of printing a warning and then trying to call a string.
    if vectorizer not in vectorizer_classes:
        raise ValueError(f'unrecognized vectorizer {vectorizer!r} in row {i}: {param_set}')

    print('vectorizing')
    model_vectorizer = vectorizer_classes[vectorizer](max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    # Keep the bag-of-words sparse: a dense copy needs rows * vocab_size cells, which
    # does not scale to the full training set.
    # NOTE(review): the vectorizer is fit on all rows, including each fold's validation rows.
    BOW = model_vectorizer.fit_transform(train_X[text_column])

    print('splitting train-validation data')
    # NOTE(review): KFold is used without shuffling, so the first max_train_rows training
    # rows may be the very same rows in several folds — confirm this is intended.
    kfolds = KFold(n_splits=k)
    model = DecisionTreeClassifier(criterion=criterion,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   random_state=42)  # reproducible feature sampling / tie-breaking

    acc_score = []
    fold_probs = []
    for train_index, valid_index in kfolds.split(BOW):
        X_train, X_test = BOW[train_index], BOW[valid_index]
        # KFold yields positions, so index the labels positionally as well.
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[valid_index]

        model.fit(X_train[:max_train_rows], y_train.iloc[:max_train_rows])
        pred_values = model.predict(X_test)

        acc_score.append(accuracy_score(y_test, pred_values))
        # Keep every fold's probabilities, not just the last fold's.
        fold_probs.append(model.predict_proba(X_test))

    mean_acc = sum(acc_score) / len(acc_score)

    print(mean_acc, '\n\n')
    model_accuracy.append(mean_acc)
    predictions.append(fold_probs)
    
    
    

    
    
  
    
    

  0%|                                                                                                                                                                   | 0/20 [00:00<?, ?it/s]

1000 DecisionTreeClassifier() text_porter_stemmed (1, 2) CountVectorizer gini 30 None
vectorizing
creating BOW
splitting train-validation data


  5%|███████▋                                                                                                                                                  | 1/20 [02:10<41:16, 130.34s/it]

0.6457359868231466 


10000 DecisionTreeClassifier() text_porter_stemmed (1, 2) CountVectorizer gini 30 None
vectorizing
creating BOW
splitting train-validation data


In [None]:
# Display the mean cross-validated accuracy collected for each evaluated parameter set.
model_accuracy

In [None]:
# Add the cross-validated accuracy and re-rank. .assign builds a new frame instead of
# writing a column into a slice of param_df, which avoids pandas' SettingWithCopyWarning.
# model_accuracy must hold exactly one value per row of top_20_params.
top_20_params = (
    top_20_params
    .assign(acc2=model_accuracy)
    .sort_values(by='acc2', ascending=False)
    .reset_index(drop=True)
)
top_20_params

In [None]:
# Persist the re-ranked top-20 table to the working directory, without the index column.
top_20_params.to_csv('decision_tree_top_20_params.csv', index=False)

Repeat the evaluation one last time with the top 3 parameter combinations and more data (each fold's tree is fit on all of that fold's training rows)

In [None]:
# Keep the three best parameter sets from the re-ranked top-20 table.
top_3_params = top_20_params.head(3)

In [None]:

# Re-score each of the top-3 parameter sets with k-fold cross-validation, fitting each
# fold's tree on all of that fold's training rows.
model_accuracy = []
top_3_predictions = []  # one entry per parameter set: list of per-fold predict_proba arrays

# Map the vectorizer name stored in the results table back to its class.
vectorizer_classes = {
    'TfidfVectorizer': TfidfVectorizer,
    'CountVectorizer': CountVectorizer,
}

for i, param_set in enumerate(tqdm(top_3_params.values)):
    # Positional unpacking: relies on the column order of the results table;
    # trailing columns (accuracy, acc2, ...) are ignored.
    (vocab_size,
     model_type,
     text_column,
     ngram,
     vectorizer,
     criterion,
     max_depth,
     max_features,
     *_) = param_set

    print(vocab_size, model_type, text_column, ngram, vectorizer, criterion, max_depth, max_features)

    if vocab_size:
        vocab_size = int(vocab_size)

    # Fail loudly instead of printing a warning and then trying to call a string.
    if vectorizer not in vectorizer_classes:
        raise ValueError(f'unrecognized vectorizer {vectorizer!r} in row {i}: {param_set}')

    print('vectorizing')
    model_vectorizer = vectorizer_classes[vectorizer](max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    # Keep the bag-of-words sparse: a dense copy needs rows * vocab_size cells, which
    # does not scale to the full training set.
    # NOTE(review): the vectorizer is fit on all rows, including each fold's validation rows.
    BOW = model_vectorizer.fit_transform(train_X[text_column])

    print('splitting train-validation data')
    kfolds = KFold(n_splits=k)
    model = DecisionTreeClassifier(criterion=criterion,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   random_state=42)  # reproducible feature sampling / tie-breaking

    acc_score = []
    fold_probs = []
    for train_index, valid_index in kfolds.split(BOW):
        X_train, X_test = BOW[train_index], BOW[valid_index]
        # KFold yields positions, so index the labels positionally as well.
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[valid_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        acc_score.append(accuracy_score(y_test, pred_values))
        # Keep every fold's probabilities, not just the last fold's.
        fold_probs.append(model.predict_proba(X_test))

    mean_acc = sum(acc_score) / len(acc_score)

    print(mean_acc, '\n\n')
    model_accuracy.append(mean_acc)
    top_3_predictions.append(fold_probs)
    
    
    

    
    
  
    
    

In [None]:
# Add the full-data cross-validated accuracy and re-rank. .assign builds a new frame
# instead of writing a column into a slice of top_20_params, which avoids pandas'
# SettingWithCopyWarning. model_accuracy must hold one value per row of top_3_params.
top_3_params = (
    top_3_params
    .assign(acc3=model_accuracy)
    .sort_values(by='acc3', ascending=False)
    .reset_index(drop=True)
)
top_3_params

In [None]:
# Persist the final top-3 ranking to the working directory, without the index column.
top_3_params.to_csv('decision_tree_top_3_params.csv', index=False)