In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import nltk
import string

from sklearn.model_selection import learning_curve


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import train_test_split

## Read and Prep Data


In [None]:
tqdm.pandas()

In [None]:
# Colab-only: mount the user's Google Drive at /content/gdrive so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive') 
# Root folder of the project on the mounted drive; every input/output path below is built from it.
path = '/content/gdrive/MyDrive/IFT 6390/kaggle2/'

Mounted at /content/gdrive


In [None]:
input_path = path+'data/input/'

In [None]:
# Load the pre-processed train and test frames and replace every missing value with an empty string
# (presumably texts that became empty after preprocessing -- TODO confirm).
all_train_data = pd.read_csv(input_path + 'train_data_prepped.csv').fillna('')
all_test_data = pd.read_csv(input_path + 'test_data_prepped.csv').fillna('')

In [None]:
all_train_data.head(1)

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized
0,0,anyway im getting of for a while,train,anyway im getting of for a while,anyway im getting of for a while,anyway im getting,anyway im get of for a while,anyway im get of for a whil,anyway im getting of for a while


In [None]:
all_test_data.head(1)

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized
0,0,so cut... missed out on all the tce extreme me...,test,so cut missed out on all the tce extreme merch...,so cut missed out on all the tce extreme merch...,cut missed tce extreme merch! sizes,so cut miss out on all the tce extrem merch! i...,so cut miss out on al the tce extrem merch! in...,so cut missed out on all the tce extreme merch...


In [None]:
all_train_data[all_train_data.isnull().any(axis=1)]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


In [None]:
all_test_data[all_test_data.isnull().any(axis=1)]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


Load the train targets and binarize them. Neutral rows (only 84) are not dropped: the mapping below folds them into the positive class.

In [None]:
train_target = pd.read_csv(input_path + 'train_results.csv')
train_target['target'].value_counts()

positive    520436
negative    519803
neutral         84
Name: target, dtype: int64

In [None]:
train_target.head()

Unnamed: 0,id,target
0,0,positive
1,1,negative
2,2,positive
3,3,positive
4,4,negative


In [None]:
# Encode the sentiment label numerically: 'negative' -> 0, every other label
# (both 'positive' and 'neutral') -> 2.
train_target['target'] = train_target['target'].map(lambda label: 0 if label == 'negative' else 2)

In [None]:
# Attach the encoded labels as the last column of the training frame.
# NOTE(review): the assignment aligns on the row index, not on the `id` column --
# assumes both CSVs list the samples in the same order; confirm.
all_train_data['target'] = train_target['target']

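Check that the target is binary. The code that dropped neutral rows is commented out below, because neutral rows were already merged into the positive class (label 2) above.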

In [None]:
# Disabled alternative that dropped neutral rows and used a 0/1 encoding; kept for reference only.
# It compares against string labels, but `target` has already been converted to integers above.
# all_train_data = all_train_data[all_train_data['target'] != 'neutral']
# all_train_data['target'] = [1 if t=='positive' else 0 for t in all_train_data['target'].values]
# Class balance after encoding.
all_train_data['target'].value_counts()


2    520520
0    519803
Name: target, dtype: int64

### prepare bag of words<br>
we need a different model for each text format

Naive Bayes has essentially no hyperparameters to tune (beyond smoothing), so instead we'll test different versions of the text processing with different versions of Naive Bayes and corpus generation.

the following parameters will be tested<br>
1. size of corpus vocab
2. corpus ngram varieties
3. bernoulli, multinomial or gaussian naive bayes
4. stemming/lemmatizing methods

In [None]:
# Search grid for the first (small-sample) round.
vocab_sizes = [128, 1024, 8096, 16384]
ngrams = [(1, 1), (2, 2), (1, 2), (1, 3), (1, 4)]
models = [BernoulliNB(), GaussianNB(), MultinomialNB()]
text_columns = ['text_no_numerals', 'text_no_sw', 'text_porter_stemmed', 'text_lancaster_stemmed', 'text_lemmatized']
# Minimum predicted probability of the positive class for a row to be labelled positive.
threshold = 0.5

# Shuffle the training rows; the fixed seed makes the round-1 subset reproducible
# on a fresh "Restart & Run All".
all_train_data = all_train_data.sample(frac=1, random_state=42)

# Separate features from the label by name instead of relying on `target` being the last column.
train_X = all_train_data.drop(columns='target')
train_y = all_train_data['target']


# The first round of training only uses the first 10,000 shuffled rows.
train_v1 = train_X[:10_000]
targets_v1 = train_y[:10_000]

In [None]:
targets_v1.head()

248551    0
974374    2
400458    0
207770    0
1558      0
Name: target, dtype: int64

In [None]:
def trainModels(df, targets, vocab_sizes, models, text_columns, ngrams, threshold,
                positive_label=2, negative_label=0):
    """Evaluate every (vocab size, model, text column, n-gram range) combination.

    For each combination the rows are split 80/20 (fixed seed), a
    ``CountVectorizer`` is fitted on the training texts only, the model is
    fitted on the resulting bag of words, and validation accuracy is computed
    by thresholding the predicted probability of ``positive_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns named in ``text_columns``.
    targets : array-like
        Labels aligned row-by-row with ``df``.
    vocab_sizes : iterable of int
        Values passed as ``max_features`` to ``CountVectorizer``.
    models : iterable of estimators
        Classifiers exposing ``fit``, ``predict_proba`` and ``classes_``.
        Each instance is re-fitted for every combination.
    text_columns : iterable of str
        Columns of ``df`` to vectorize.
    ngrams : iterable of (int, int)
        Values passed as ``ngram_range`` to ``CountVectorizer``.
    threshold : float
        A row is predicted positive when P(positive) >= ``threshold``.
    positive_label, negative_label : optional
        Label values used for the positive / negative prediction
        (defaults 2 and 0, matching the target encoding used in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns ``vocab_size``, ``model_type``
        (``str(model)``), ``text_column``, ``ngram`` and ``accuracy_round1``,
        sorted by accuracy in descending order.
    """
    param_tracker = []
    # Number of combinations started so far; printed every 10th run.
    counter = 0
    for size in tqdm(vocab_sizes):

        for model in models:
            # These two estimators accept scipy sparse matrices, which avoids
            # materialising a dense (rows x vocab) array. Anything else
            # (e.g. GaussianNB) keeps receiving dense arrays.
            sparse_ok = isinstance(model, (MultinomialNB, BernoulliNB))

            for col in text_columns:

                for ng in tqdm(ngrams):

                    counter += 1
                    if counter % 10 == 0:
                        print(f'{counter}')

                    # Split the raw text first so the vocabulary is learned from
                    # the training rows only (no leakage from validation text).
                    text_train, text_valid, y_train, y_valid = train_test_split(
                        df[col], targets, test_size=0.2, random_state=42)

                    vectorizer = CountVectorizer(max_features=size, ngram_range=ng)
                    X_train = vectorizer.fit_transform(text_train)
                    X_valid = vectorizer.transform(text_valid)
                    if not sparse_ok:
                        X_train = X_train.toarray()
                        X_valid = X_valid.toarray()

                    model.fit(X_train, y_train)

                    prob_predictions = model.predict_proba(X_valid)

                    # Column of predict_proba that corresponds to the positive class.
                    classes = list(model.classes_)
                    if positive_label in classes:
                        pos_predictions = prob_predictions[:, classes.index(positive_label)]
                    else:
                        # The model never saw a positive example, so P(positive) is 0 everywhere.
                        pos_predictions = np.zeros(len(y_valid))

                    binary_pred = np.where(pos_predictions >= threshold, positive_label, negative_label)
                    num_correct_pred = int(np.count_nonzero(binary_pred == np.asarray(y_valid)))
                    accuracy = num_correct_pred / len(y_valid)
                    print(f'accuracy: {accuracy}')
                    print('\n\n')

                    param_tracker.append((size, str(model), col, ng, accuracy))

    param_df = pd.DataFrame(param_tracker, columns = ['vocab_size', 'model_type', 'text_column', 'ngram', 'accuracy_round1'])
    param_df = param_df.sort_values(by='accuracy_round1', ascending=False)
    return param_df

                    
                
    # for vocab size we'll test

In [None]:
param_df = trainModels(train_v1, targets_v1, vocab_sizes, models, text_columns, ngrams, threshold)

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.10it/s][A

accuracy: 0.664






 40%|████      | 2/5 [00:00<00:01,  2.05it/s][A

accuracy: 0.589






 60%|██████    | 3/5 [00:01<00:01,  1.81it/s][A

accuracy: 0.664






 80%|████████  | 4/5 [00:02<00:00,  1.34it/s][A

accuracy: 0.664






100%|██████████| 5/5 [00:04<00:00,  1.19it/s]


accuracy: 0.664






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.00it/s][A

accuracy: 0.65






 40%|████      | 2/5 [00:00<00:00,  3.19it/s][A

accuracy: 0.551






 60%|██████    | 3/5 [00:01<00:00,  2.65it/s][A

accuracy: 0.65



10



 80%|████████  | 4/5 [00:01<00:00,  2.00it/s][A

accuracy: 0.65






100%|██████████| 5/5 [00:02<00:00,  1.90it/s]


accuracy: 0.65






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  3.96it/s][A

accuracy: 0.672






 40%|████      | 2/5 [00:00<00:01,  2.60it/s][A

accuracy: 0.5885






 60%|██████    | 3/5 [00:01<00:00,  2.14it/s][A

accuracy: 0.674






 80%|████████  | 4/5 [00:02<00:00,  1.46it/s][A

accuracy: 0.674






100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


accuracy: 0.6725






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.21it/s][A

accuracy: 0.677






 40%|████      | 2/5 [00:00<00:01,  2.68it/s][A

accuracy: 0.5855






 60%|██████    | 3/5 [00:01<00:00,  2.19it/s][A

accuracy: 0.6765



20



 80%|████████  | 4/5 [00:02<00:00,  1.49it/s][A

accuracy: 0.6765






100%|██████████| 5/5 [00:03<00:00,  1.34it/s]


accuracy: 0.6765






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  3.85it/s][A

accuracy: 0.669






 40%|████      | 2/5 [00:00<00:01,  2.61it/s][A

accuracy: 0.588






 60%|██████    | 3/5 [00:01<00:01,  1.63it/s][A

accuracy: 0.668






 80%|████████  | 4/5 [00:04<00:01,  1.36s/it][A

accuracy: 0.668






100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


accuracy: 0.668






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.34it/s][A

accuracy: 0.6385






 40%|████      | 2/5 [00:01<00:02,  1.22it/s][A

accuracy: 0.589






 60%|██████    | 3/5 [00:02<00:01,  1.42it/s][A

accuracy: 0.635



30



 80%|████████  | 4/5 [00:03<00:00,  1.24it/s][A

accuracy: 0.635






100%|██████████| 5/5 [00:05<00:00,  1.04s/it]


accuracy: 0.635






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  3.04it/s][A

accuracy: 0.6395






 40%|████      | 2/5 [00:00<00:01,  2.14it/s][A

accuracy: 0.5445






 60%|██████    | 3/5 [00:01<00:01,  1.51it/s][A

accuracy: 0.6395






 80%|████████  | 4/5 [00:03<00:00,  1.08it/s][A

accuracy: 0.6395






100%|██████████| 5/5 [00:05<00:00,  1.03s/it]


accuracy: 0.6395






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.53it/s][A

accuracy: 0.6565






 40%|████      | 2/5 [00:01<00:02,  1.30it/s][A

accuracy: 0.588






 60%|██████    | 3/5 [00:02<00:02,  1.03s/it][A

accuracy: 0.652



40



 80%|████████  | 4/5 [00:04<00:01,  1.32s/it][A

accuracy: 0.652






100%|██████████| 5/5 [00:06<00:00,  1.27s/it]


accuracy: 0.65






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.83it/s][A

accuracy: 0.652






 40%|████      | 2/5 [00:00<00:01,  2.93it/s][A

accuracy: 0.586






 60%|██████    | 3/5 [00:01<00:00,  2.28it/s][A

accuracy: 0.644






 80%|████████  | 4/5 [00:02<00:00,  1.55it/s][A

accuracy: 0.644






100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


accuracy: 0.644






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.02s/it][A

accuracy: 0.6475






 40%|████      | 2/5 [00:02<00:04,  1.53s/it][A

accuracy: 0.586






 60%|██████    | 3/5 [00:04<00:02,  1.39s/it][A

accuracy: 0.64



50



 80%|████████  | 4/5 [00:06<00:01,  1.58s/it][A

accuracy: 0.64






100%|██████████| 5/5 [00:07<00:00,  1.52s/it]


accuracy: 0.64






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.71it/s][A

accuracy: 0.6595






 40%|████      | 2/5 [00:00<00:01,  2.75it/s][A

accuracy: 0.5685






 60%|██████    | 3/5 [00:01<00:00,  2.18it/s][A

accuracy: 0.654






 80%|████████  | 4/5 [00:02<00:00,  1.47it/s][A

accuracy: 0.654






100%|██████████| 5/5 [00:03<00:00,  1.33it/s]


accuracy: 0.654






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.44it/s][A

accuracy: 0.6335






 40%|████      | 2/5 [00:00<00:00,  3.29it/s][A

accuracy: 0.5475






 60%|██████    | 3/5 [00:01<00:00,  2.71it/s][A

accuracy: 0.6335



60



 80%|████████  | 4/5 [00:01<00:00,  2.00it/s][A

accuracy: 0.6335






100%|██████████| 5/5 [00:02<00:00,  1.88it/s]


accuracy: 0.6335






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.58it/s][A

accuracy: 0.671






 40%|████      | 2/5 [00:00<00:01,  2.76it/s][A

accuracy: 0.58






 60%|██████    | 3/5 [00:01<00:00,  2.15it/s][A

accuracy: 0.6705






 80%|████████  | 4/5 [00:02<00:00,  1.45it/s][A

accuracy: 0.6705






100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


accuracy: 0.6715






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.42it/s][A

accuracy: 0.669






 40%|████      | 2/5 [00:00<00:01,  2.77it/s][A

accuracy: 0.58






 60%|██████    | 3/5 [00:01<00:00,  2.18it/s][A

accuracy: 0.6635



70



 80%|████████  | 4/5 [00:02<00:00,  1.48it/s][A

accuracy: 0.6635






100%|██████████| 5/5 [00:03<00:00,  1.34it/s]


accuracy: 0.6635






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:00,  4.04it/s][A

accuracy: 0.661






 40%|████      | 2/5 [00:00<00:01,  2.70it/s][A

accuracy: 0.5725






 60%|██████    | 3/5 [00:01<00:00,  2.12it/s][A

accuracy: 0.661






 80%|████████  | 4/5 [00:02<00:00,  1.45it/s][A

accuracy: 0.661






100%|██████████| 5/5 [00:03<00:00,  1.32it/s]
 25%|██▌       | 1/4 [01:08<03:26, 68.98s/it]

accuracy: 0.661






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.31it/s][A

accuracy: 0.7195






 40%|████      | 2/5 [00:01<00:02,  1.21it/s][A

accuracy: 0.646






 60%|██████    | 3/5 [00:02<00:01,  1.14it/s][A

accuracy: 0.7215



80



 80%|████████  | 4/5 [00:03<00:01,  1.06s/it][A

accuracy: 0.719






100%|██████████| 5/5 [00:05<00:00,  1.15s/it]


accuracy: 0.7225






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.91it/s][A

accuracy: 0.699






 40%|████      | 2/5 [00:01<00:01,  1.60it/s][A

accuracy: 0.574






 60%|██████    | 3/5 [00:02<00:01,  1.44it/s][A

accuracy: 0.7005






 80%|████████  | 4/5 [00:03<00:00,  1.22it/s][A

accuracy: 0.6985






100%|██████████| 5/5 [00:04<00:00,  1.19it/s]


accuracy: 0.7






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.80it/s][A

accuracy: 0.7295






 40%|████      | 2/5 [00:01<00:02,  1.47it/s][A

accuracy: 0.654






 60%|██████    | 3/5 [00:02<00:01,  1.29it/s][A

accuracy: 0.725



90



 80%|████████  | 4/5 [00:03<00:00,  1.02it/s][A

accuracy: 0.7225






100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


accuracy: 0.7225






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.82it/s][A

accuracy: 0.7285






 40%|████      | 2/5 [00:01<00:02,  1.49it/s][A

accuracy: 0.6575






 60%|██████    | 3/5 [00:02<00:01,  1.27it/s][A

accuracy: 0.7255






 80%|████████  | 4/5 [00:03<00:00,  1.00it/s][A

accuracy: 0.722






100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


accuracy: 0.724






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.80it/s][A

accuracy: 0.7185






 40%|████      | 2/5 [00:01<00:02,  1.46it/s][A

accuracy: 0.649






 60%|██████    | 3/5 [00:02<00:01,  1.30it/s][A

accuracy: 0.7265



100



 80%|████████  | 4/5 [00:03<00:00,  1.01it/s][A

accuracy: 0.7265






100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


accuracy: 0.724






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.11it/s][A

accuracy: 0.6605






 40%|████      | 2/5 [00:01<00:01,  1.78it/s][A

accuracy: 0.62






 60%|██████    | 3/5 [00:01<00:01,  1.54it/s][A

accuracy: 0.6975






 80%|████████  | 4/5 [00:03<00:00,  1.13it/s][A

accuracy: 0.702






100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


accuracy: 0.699






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.45it/s][A

accuracy: 0.6455






 40%|████      | 2/5 [00:00<00:01,  2.10it/s][A

accuracy: 0.5625






 60%|██████    | 3/5 [00:01<00:01,  1.85it/s][A

accuracy: 0.66



110



 80%|████████  | 4/5 [00:02<00:00,  1.53it/s][A

accuracy: 0.667






100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


accuracy: 0.665






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.47it/s][A

accuracy: 0.654






 40%|████      | 2/5 [00:01<00:01,  1.90it/s][A

accuracy: 0.624






 60%|██████    | 3/5 [00:01<00:01,  1.62it/s][A

accuracy: 0.6835






 80%|████████  | 4/5 [00:02<00:00,  1.20it/s][A

accuracy: 0.688






100%|██████████| 5/5 [00:04<00:00,  1.11it/s]


accuracy: 0.687






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.33it/s][A

accuracy: 0.664






 40%|████      | 2/5 [00:01<00:01,  1.86it/s][A

accuracy: 0.628






 60%|██████    | 3/5 [00:01<00:01,  1.60it/s][A

accuracy: 0.6895



120



 80%|████████  | 4/5 [00:02<00:00,  1.21it/s][A

accuracy: 0.696






100%|██████████| 5/5 [00:04<00:00,  1.11it/s]


accuracy: 0.6955






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.41it/s][A

accuracy: 0.652






 40%|████      | 2/5 [00:01<00:01,  1.90it/s][A

accuracy: 0.622






 60%|██████    | 3/5 [00:01<00:01,  1.60it/s][A

accuracy: 0.6915






 80%|████████  | 4/5 [00:02<00:00,  1.20it/s][A

accuracy: 0.6925






100%|██████████| 5/5 [00:04<00:00,  1.10it/s]


accuracy: 0.693






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.08it/s][A

accuracy: 0.711






 40%|████      | 2/5 [00:01<00:01,  1.63it/s][A

accuracy: 0.612






 60%|██████    | 3/5 [00:02<00:01,  1.36it/s][A

accuracy: 0.709



130



 80%|████████  | 4/5 [00:03<00:01,  1.09s/it][A

accuracy: 0.7075






100%|██████████| 5/5 [00:05<00:00,  1.08s/it]


accuracy: 0.71






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.22it/s][A

accuracy: 0.6895






 40%|████      | 2/5 [00:01<00:01,  1.79it/s][A

accuracy: 0.5585






 60%|██████    | 3/5 [00:01<00:01,  1.62it/s][A

accuracy: 0.69






 80%|████████  | 4/5 [00:02<00:00,  1.34it/s][A

accuracy: 0.6905






100%|██████████| 5/5 [00:03<00:00,  1.30it/s]


accuracy: 0.693






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.05it/s][A

accuracy: 0.7195






 40%|████      | 2/5 [00:01<00:01,  1.62it/s][A

accuracy: 0.618






 60%|██████    | 3/5 [00:02<00:01,  1.40it/s][A

accuracy: 0.7185



140



 80%|████████  | 4/5 [00:03<00:00,  1.09it/s][A

accuracy: 0.719






100%|██████████| 5/5 [00:04<00:00,  1.01it/s]


accuracy: 0.719






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  1.92it/s][A

accuracy: 0.713






 40%|████      | 2/5 [00:01<00:01,  1.58it/s][A

accuracy: 0.618






 60%|██████    | 3/5 [00:02<00:01,  1.42it/s][A

accuracy: 0.711






 80%|████████  | 4/5 [00:03<00:01,  1.03s/it][A

accuracy: 0.71






100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


accuracy: 0.7105






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:02,  2.00it/s][A

accuracy: 0.71






 40%|████      | 2/5 [00:01<00:01,  1.63it/s][A

accuracy: 0.6085






 60%|██████    | 3/5 [00:01<00:01,  1.43it/s][A

accuracy: 0.712



150



 80%|████████  | 4/5 [00:03<00:00,  1.10it/s][A

accuracy: 0.713






100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
 50%|█████     | 2/4 [02:21<02:21, 70.84s/it]

accuracy: 0.7125






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:20,  5.12s/it][A

accuracy: 0.725






 40%|████      | 2/5 [00:09<00:14,  4.92s/it][A

accuracy: 0.678






 60%|██████    | 3/5 [00:13<00:08,  4.21s/it][A

accuracy: 0.738






 80%|████████  | 4/5 [00:17<00:04,  4.31s/it][A

accuracy: 0.7355






100%|██████████| 5/5 [00:26<00:00,  5.25s/it]


accuracy: 0.7365






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:12,  3.23s/it][A

accuracy: 0.7095






 40%|████      | 2/5 [00:06<00:10,  3.53s/it][A

accuracy: 0.6065






 60%|██████    | 3/5 [00:11<00:08,  4.04s/it][A

accuracy: 0.7155



160



 80%|████████  | 4/5 [00:15<00:03,  3.81s/it][A

accuracy: 0.7175






100%|██████████| 5/5 [00:19<00:00,  3.88s/it]


accuracy: 0.716






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:17,  4.39s/it][A

accuracy: 0.7275






 40%|████      | 2/5 [00:07<00:11,  3.72s/it][A

accuracy: 0.685






 60%|██████    | 3/5 [00:11<00:07,  3.81s/it][A

accuracy: 0.738






 80%|████████  | 4/5 [00:16<00:04,  4.32s/it][A

accuracy: 0.737






100%|██████████| 5/5 [00:20<00:00,  4.17s/it]


accuracy: 0.741






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:14,  3.57s/it][A

accuracy: 0.726






 40%|████      | 2/5 [00:08<00:12,  4.17s/it][A

accuracy: 0.6815






 60%|██████    | 3/5 [00:11<00:07,  3.76s/it][A

accuracy: 0.7295



170



 80%|████████  | 4/5 [00:15<00:03,  3.96s/it][A

accuracy: 0.731






100%|██████████| 5/5 [00:21<00:00,  4.32s/it]


accuracy: 0.729






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.96s/it][A

accuracy: 0.722






 40%|████      | 2/5 [00:06<00:10,  3.44s/it][A

accuracy: 0.6775






 60%|██████    | 3/5 [00:11<00:08,  4.04s/it][A

accuracy: 0.733






 80%|████████  | 4/5 [00:15<00:04,  4.07s/it][A

accuracy: 0.7295






100%|██████████| 5/5 [00:20<00:00,  4.07s/it]


accuracy: 0.7305






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:07,  1.83s/it][A

accuracy: 0.5735






 40%|████      | 2/5 [00:03<00:05,  1.92s/it][A

accuracy: 0.618






 60%|██████    | 3/5 [00:05<00:04,  2.01s/it][A

accuracy: 0.6465



180



 80%|████████  | 4/5 [00:08<00:02,  2.23s/it][A

accuracy: 0.648






100%|██████████| 5/5 [00:11<00:00,  2.29s/it]


accuracy: 0.642






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:06,  1.74s/it][A

accuracy: 0.5725






 40%|████      | 2/5 [00:03<00:05,  1.82s/it][A

accuracy: 0.577






 60%|██████    | 3/5 [00:05<00:03,  1.87s/it][A

accuracy: 0.6335






 80%|████████  | 4/5 [00:07<00:01,  1.99s/it][A

accuracy: 0.633






100%|██████████| 5/5 [00:10<00:00,  2.02s/it]


accuracy: 0.6405






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:06,  1.72s/it][A

accuracy: 0.5605






 40%|████      | 2/5 [00:03<00:05,  1.87s/it][A

accuracy: 0.614






 60%|██████    | 3/5 [00:05<00:03,  1.97s/it][A

accuracy: 0.6235



190



 80%|████████  | 4/5 [00:08<00:02,  2.16s/it][A

accuracy: 0.618






100%|██████████| 5/5 [00:11<00:00,  2.23s/it]


accuracy: 0.6315






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:07,  1.77s/it][A

accuracy: 0.5625






 40%|████      | 2/5 [00:03<00:05,  1.86s/it][A

accuracy: 0.616






 60%|██████    | 3/5 [00:05<00:03,  1.96s/it][A

accuracy: 0.63






 80%|████████  | 4/5 [00:08<00:02,  2.16s/it][A

accuracy: 0.641






100%|██████████| 5/5 [00:11<00:00,  2.22s/it]


accuracy: 0.638






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:07,  1.78s/it][A

accuracy: 0.5625






 40%|████      | 2/5 [00:03<00:05,  1.92s/it][A

accuracy: 0.619






 60%|██████    | 3/5 [00:05<00:03,  1.98s/it][A

accuracy: 0.639



200



 80%|████████  | 4/5 [00:08<00:02,  2.18s/it][A

accuracy: 0.6375






100%|██████████| 5/5 [00:11<00:00,  2.26s/it]


accuracy: 0.6475






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:12,  3.15s/it][A

accuracy: 0.728






 40%|████      | 2/5 [00:07<00:11,  3.77s/it][A

accuracy: 0.6635






 60%|██████    | 3/5 [00:10<00:06,  3.36s/it][A

accuracy: 0.739






 80%|████████  | 4/5 [00:15<00:04,  4.01s/it][A

accuracy: 0.7445






100%|██████████| 5/5 [00:20<00:00,  4.10s/it]


accuracy: 0.7395






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.50s/it][A

accuracy: 0.708






 40%|████      | 2/5 [00:05<00:08,  2.96s/it][A

accuracy: 0.603






 60%|██████    | 3/5 [00:09<00:07,  3.50s/it][A

accuracy: 0.7135



210



 80%|████████  | 4/5 [00:12<00:03,  3.28s/it][A

accuracy: 0.715






100%|██████████| 5/5 [00:16<00:00,  3.33s/it]


accuracy: 0.7165






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:15,  3.92s/it][A

accuracy: 0.718






 40%|████      | 2/5 [00:06<00:09,  3.22s/it][A

accuracy: 0.6675






 60%|██████    | 3/5 [00:10<00:06,  3.31s/it][A

accuracy: 0.7325






 80%|████████  | 4/5 [00:14<00:03,  3.83s/it][A

accuracy: 0.739






100%|██████████| 5/5 [00:18<00:00,  3.66s/it]


accuracy: 0.7405






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:12,  3.13s/it][A

accuracy: 0.715






 40%|████      | 2/5 [00:07<00:11,  3.70s/it][A

accuracy: 0.6635






 60%|██████    | 3/5 [00:10<00:06,  3.30s/it][A

accuracy: 0.729



220



 80%|████████  | 4/5 [00:13<00:03,  3.51s/it][A

accuracy: 0.7345






100%|██████████| 5/5 [00:18<00:00,  3.79s/it]


accuracy: 0.7325






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:10,  2.53s/it][A

accuracy: 0.7185






 40%|████      | 2/5 [00:05<00:09,  3.00s/it][A

accuracy: 0.6665






 60%|██████    | 3/5 [00:10<00:07,  3.60s/it][A

accuracy: 0.7315






 80%|████████  | 4/5 [00:13<00:03,  3.47s/it][A

accuracy: 0.734






100%|██████████| 5/5 [00:17<00:00,  3.56s/it]
 75%|███████▌  | 3/4 [06:37<02:35, 155.40s/it]

accuracy: 0.7325






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:11<00:44, 11.20s/it][A

accuracy: 0.7245






 40%|████      | 2/5 [00:21<00:31, 10.39s/it][A

accuracy: 0.678






 60%|██████    | 3/5 [00:30<00:20, 10.18s/it][A

accuracy: 0.7345



230



 80%|████████  | 4/5 [00:41<00:10, 10.16s/it][A

accuracy: 0.735






100%|██████████| 5/5 [00:50<00:00, 10.17s/it]


accuracy: 0.738






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:35,  8.85s/it][A

accuracy: 0.7075






 40%|████      | 2/5 [00:18<00:27,  9.24s/it][A

accuracy: 0.6065






 60%|██████    | 3/5 [00:27<00:18,  9.06s/it][A

accuracy: 0.718






 80%|████████  | 4/5 [00:37<00:09,  9.37s/it][A

accuracy: 0.717






100%|██████████| 5/5 [00:47<00:00,  9.40s/it]


accuracy: 0.7145






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:06<00:24,  6.21s/it][A

accuracy: 0.7245






 40%|████      | 2/5 [00:16<00:25,  8.38s/it][A

accuracy: 0.6845






 60%|██████    | 3/5 [00:25<00:17,  8.80s/it][A

accuracy: 0.7345



240



 80%|████████  | 4/5 [00:34<00:09,  9.01s/it][A

accuracy: 0.7345






100%|██████████| 5/5 [00:45<00:00,  9.06s/it]


accuracy: 0.7305






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:07<00:28,  7.23s/it][A

accuracy: 0.726






 40%|████      | 2/5 [00:16<00:26,  8.69s/it][A

accuracy: 0.6895






 60%|██████    | 3/5 [00:26<00:17,  8.98s/it][A

accuracy: 0.7375






 80%|████████  | 4/5 [00:35<00:09,  9.12s/it][A

accuracy: 0.7395






100%|██████████| 5/5 [00:46<00:00,  9.21s/it]


accuracy: 0.733






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:32,  8.02s/it][A

accuracy: 0.7195






 40%|████      | 2/5 [00:17<00:26,  8.80s/it][A

accuracy: 0.685






 60%|██████    | 3/5 [00:26<00:17,  8.90s/it][A

accuracy: 0.73



250



 80%|████████  | 4/5 [00:35<00:08,  8.93s/it][A

accuracy: 0.728






100%|██████████| 5/5 [00:45<00:00,  9.18s/it]


accuracy: 0.731






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.95s/it][A

accuracy: 0.576






 40%|████      | 2/5 [00:06<00:09,  3.28s/it][A

accuracy: 0.629






 60%|██████    | 3/5 [00:09<00:06,  3.34s/it][A

accuracy: 0.642






 80%|████████  | 4/5 [00:13<00:03,  3.64s/it][A

accuracy: 0.6515






100%|██████████| 5/5 [00:18<00:00,  3.70s/it]


accuracy: 0.654






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.96s/it][A

accuracy: 0.572






 40%|████      | 2/5 [00:06<00:09,  3.25s/it][A

accuracy: 0.569






 60%|██████    | 3/5 [00:10<00:06,  3.40s/it][A

accuracy: 0.612



260



 80%|████████  | 4/5 [00:13<00:03,  3.51s/it][A

accuracy: 0.62






100%|██████████| 5/5 [00:17<00:00,  3.51s/it]


accuracy: 0.63






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:10,  2.63s/it][A

accuracy: 0.55






 40%|████      | 2/5 [00:06<00:09,  3.07s/it][A

accuracy: 0.6175






 60%|██████    | 3/5 [00:09<00:06,  3.30s/it][A

accuracy: 0.63






 80%|████████  | 4/5 [00:13<00:03,  3.59s/it][A

accuracy: 0.63






100%|██████████| 5/5 [00:18<00:00,  3.62s/it]


accuracy: 0.6305






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.34s/it][A

accuracy: 0.558






 40%|████      | 2/5 [00:05<00:09,  3.04s/it][A

accuracy: 0.6245






 60%|██████    | 3/5 [00:09<00:06,  3.27s/it][A

accuracy: 0.632



270



 80%|████████  | 4/5 [00:13<00:03,  3.54s/it][A

accuracy: 0.644






100%|██████████| 5/5 [00:17<00:00,  3.57s/it]


accuracy: 0.6395






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.86s/it][A

accuracy: 0.5665






 40%|████      | 2/5 [00:06<00:09,  3.19s/it][A

accuracy: 0.6225






 60%|██████    | 3/5 [00:09<00:06,  3.38s/it][A

accuracy: 0.6395






 80%|████████  | 4/5 [00:13<00:03,  3.65s/it][A

accuracy: 0.6415






100%|██████████| 5/5 [00:18<00:00,  3.73s/it]


accuracy: 0.644






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:32,  8.22s/it][A

accuracy: 0.725






 40%|████      | 2/5 [00:16<00:25,  8.54s/it][A

accuracy: 0.6695






 60%|██████    | 3/5 [00:26<00:17,  8.77s/it][A

accuracy: 0.7325



280



 80%|████████  | 4/5 [00:35<00:08,  8.93s/it][A

accuracy: 0.7365






100%|██████████| 5/5 [00:44<00:00,  8.85s/it]


accuracy: 0.736






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:32,  8.11s/it][A

accuracy: 0.71






 40%|████      | 2/5 [00:18<00:29,  9.72s/it][A

accuracy: 0.5985






 60%|██████    | 3/5 [00:28<00:18,  9.46s/it][A

accuracy: 0.715






 80%|████████  | 4/5 [00:36<00:08,  8.91s/it][A

accuracy: 0.7145






100%|██████████| 5/5 [00:45<00:00,  9.18s/it]


accuracy: 0.716






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:07<00:29,  7.36s/it][A

accuracy: 0.7205






 40%|████      | 2/5 [00:16<00:25,  8.51s/it][A

accuracy: 0.676






 60%|██████    | 3/5 [00:23<00:15,  7.84s/it][A

accuracy: 0.7355



290



 80%|████████  | 4/5 [00:33<00:08,  8.64s/it][A

accuracy: 0.7345






100%|██████████| 5/5 [00:44<00:00,  8.84s/it]


accuracy: 0.7355






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:07<00:29,  7.27s/it][A

accuracy: 0.7155






 40%|████      | 2/5 [00:14<00:21,  7.12s/it][A

accuracy: 0.675






 60%|██████    | 3/5 [00:23<00:16,  8.04s/it][A

accuracy: 0.7295






 80%|████████  | 4/5 [00:33<00:08,  8.74s/it][A

accuracy: 0.7335






100%|██████████| 5/5 [00:42<00:00,  8.43s/it]


accuracy: 0.726






  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:07<00:31,  7.99s/it][A

accuracy: 0.72






 40%|████      | 2/5 [00:17<00:26,  8.68s/it][A

accuracy: 0.6675






 60%|██████    | 3/5 [00:22<00:14,  7.30s/it][A

accuracy: 0.728



300



 80%|████████  | 4/5 [00:32<00:08,  8.23s/it][A

accuracy: 0.732






100%|██████████| 5/5 [00:42<00:00,  8.49s/it]
100%|██████████| 4/4 [15:42<00:00, 235.54s/it]

accuracy: 0.7315








In [None]:
param_df

Unnamed: 0,vocab_size,model_type,text_column,ngram,accuracy_round1
203,8096,MultinomialNB(),text_no_numerals,"(1, 3)",0.7445
164,8096,BernoulliNB(),text_porter_stemmed,"(1, 4)",0.7410
214,8096,MultinomialNB(),text_porter_stemmed,"(1, 4)",0.7405
204,8096,MultinomialNB(),text_no_numerals,"(1, 4)",0.7395
243,16384,BernoulliNB(),text_lancaster_stemmed,"(1, 3)",0.7395
...,...,...,...,...,...
265,16384,GaussianNB(),text_lancaster_stemmed,"(1, 1)",0.5580
6,128,BernoulliNB(),text_no_sw,"(2, 2)",0.5510
260,16384,GaussianNB(),text_porter_stemmed,"(1, 1)",0.5500
56,128,MultinomialNB(),text_no_sw,"(2, 2)",0.5475


In [None]:
param_df

Unnamed: 0,vocab_size,model_type,text_column,ngram,accuracy_round1
203,8096,MultinomialNB(),text_no_numerals,"(1, 3)",0.7445
164,8096,BernoulliNB(),text_porter_stemmed,"(1, 4)",0.7410
214,8096,MultinomialNB(),text_porter_stemmed,"(1, 4)",0.7405
204,8096,MultinomialNB(),text_no_numerals,"(1, 4)",0.7395
243,16384,BernoulliNB(),text_lancaster_stemmed,"(1, 3)",0.7395
...,...,...,...,...,...
265,16384,GaussianNB(),text_lancaster_stemmed,"(1, 1)",0.5580
6,128,BernoulliNB(),text_no_sw,"(2, 2)",0.5510
260,16384,GaussianNB(),text_porter_stemmed,"(1, 1)",0.5500
56,128,MultinomialNB(),text_no_sw,"(2, 2)",0.5475


In [None]:
# Persist the round-1 results on Drive; the next cell reloads them from this file.
round1_csv = path + 'data/output/naive_bayes_params_round1.csv'
param_df.to_csv(round1_csv, index=False)

In [None]:
# Reload the saved round-1 results.
param_df = pd.read_csv(path+'data/output/naive_bayes_params_round1.csv')

# ngram tuples come back from the CSV as strings such as "(1, 3)"; turn them
# back into int tuples. Splitting on the comma (instead of picking characters
# by position) keeps this correct for multi-digit bounds such as "(1, 10)".
param_df['ngram'] = [
    tuple(int(bound) for bound in ng.strip('()').split(','))
    for ng in param_df['ngram'].values
]

# Represent missing values as None rather than NaN so that truthiness checks
# such as `if vocab_size:` skip them.
param_df = param_df.replace({np.nan: None})

In [None]:
# Keep the first ten rows of param_df as an independent frame. The explicit
# copy lets later cells add columns to top_params without writing through a
# slice of param_df (which triggers pandas' SettingWithCopyWarning).
top_params = param_df[:10].copy()

Based on these results, we'll re-evaluate the top 10 parameter combinations on a larger sample of the training data (`num_points` rows).

In [None]:
# Number of rows sampled from the training data for the round-2 evaluation.
num_points = 100_000
# 80 % of the sample size.
# NOTE(review): num_train does not appear to be used by the round-2 loop,
# which splits with test_size=0.2 instead — confirm whether it is still needed.
num_train = int(0.8 * num_points)

In [None]:

# Round 2: re-score every parameter set in `top_params` on a larger sample.
# `model_accuracy` ends up with one validation accuracy per row of
# `top_params`, in row order.
model_accuracy = []

# Maps the model labels stored in the parameter table to estimator classes.
nb_models = {
    'MultinomialNB()': MultinomialNB,
    'BernoulliNB()': BernoulliNB,
    'GaussianNB()': GaussianNB,
}

# train is all training data with targets included!
# sample to get smaller amount of data to deal with RAM constraints.
# The sample is drawn once, with a fixed seed, so that every parameter set is
# scored on exactly the same rows and the run is reproducible.
train_sample = all_train_data.sample(num_points, random_state=42)
sample_train_y = train_sample['target']

for i, param_set in enumerate(tqdm(top_params.values)):
    vocab_size, model_type, text_column, ngram, *_ = param_set
    print(vocab_size, model_type, text_column, ngram,)

    # Fail loudly on an unknown label: continuing would silently reuse the
    # model object left over from the previous iteration.
    if model_type not in nb_models:
        raise ValueError(
            f'unrecognized model {model_type!r} in parameter set {i}: {param_set}'
        )
    # NOTE(review): GaussianNB expects dense input, while the count matrices
    # built below are sparse — confirm before including it in `top_params`.
    model = nb_models[model_type]()

    # vocab_size may be None (no cap on the vocabulary) or a float read from CSV.
    if vocab_size:
        vocab_size = int(vocab_size)

    print('splitting train-validation data')
    # Split the raw text first so the vocabulary is learned from the training
    # part only (same order of operations as the final-model cells).
    text_train, text_valid, y_train, y_valid = train_test_split(
        train_sample[text_column], sample_train_y, test_size=0.2, random_state=42
    )

    print('vectorizing')
    vectorizer = CountVectorizer(max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    X_train = vectorizer.fit_transform(text_train)
    X_valid = vectorizer.transform(text_valid)

    print('fitting model')
    model.fit(X_train, y_train)

    print('getting predictions')
    prob_predictions = model.predict_proba(X_valid)

    # Column of predict_proba that holds the probability of class 2;
    # looked up once instead of once per prediction row.
    pos_column = list(model.classes_).index(2)
    pos_predictions = prob_predictions[:, pos_column]

    # get accuracy: predict class 2 when its probability reaches `threshold`,
    # class 0 otherwise.
    # NOTE(review): `threshold` is expected to be defined by an earlier cell — confirm.
    binary_pred = np.where(pos_predictions >= threshold, 2, 0)
    accuracy = float(np.mean(binary_pred == np.asarray(y_valid)))
    print(accuracy)
    print('\n')

    model_accuracy.append(accuracy)
    
    

  0%|          | 0/10 [00:00<?, ?it/s]

8096 MultinomialNB() text_no_numerals (1, 4)
splitting train-validation data
vectorizing
creating BOW
fitting model
getting predictions


 10%|█         | 1/10 [00:19<02:58, 19.86s/it]

0.7719


8096 MultinomialNB() text_no_numerals (1, 2)
splitting train-validation data
vectorizing
creating BOW


 20%|██        | 2/10 [00:27<01:40, 12.51s/it]

fitting model
getting predictions
0.7735


8096 MultinomialNB() text_porter_stemmed (1, 3)
splitting train-validation data
vectorizing
creating BOW


 30%|███       | 3/10 [00:37<01:20, 11.51s/it]

fitting model
getting predictions
0.7749


8096 MultinomialNB() text_no_numerals (1, 3)
splitting train-validation data
vectorizing
creating BOW


 40%|████      | 4/10 [00:53<01:20, 13.34s/it]

fitting model
getting predictions
0.77415


8096 BernoulliNB() text_porter_stemmed (1, 2)
splitting train-validation data
vectorizing
creating BOW


 50%|█████     | 5/10 [00:58<00:51, 10.32s/it]

fitting model
getting predictions
0.7645


16384 BernoulliNB() text_no_numerals (1, 4)
splitting train-validation data
vectorizing
creating BOW


 60%|██████    | 6/10 [01:14<00:48, 12.19s/it]

fitting model
getting predictions
0.76545


16384 BernoulliNB() text_lancaster_stemmed (1, 3)
splitting train-validation data
vectorizing
creating BOW


 70%|███████   | 7/10 [01:24<00:34, 11.49s/it]

fitting model
getting predictions
0.7655


8096 BernoulliNB() text_no_numerals (1, 2)
splitting train-validation data
vectorizing
creating BOW


 80%|████████  | 8/10 [01:29<00:18,  9.47s/it]

fitting model
getting predictions
0.7641


8096 MultinomialNB() text_porter_stemmed (1, 4)
splitting train-validation data
vectorizing
creating BOW


 90%|█████████ | 9/10 [01:45<00:11, 11.54s/it]

fitting model
getting predictions
0.7728


8096 BernoulliNB() text_porter_stemmed (1, 4)
splitting train-validation data
vectorizing
creating BOW


100%|██████████| 10/10 [02:01<00:00, 12.16s/it]

fitting model
getting predictions
0.76065







In [None]:
# Attach the round-2 accuracies (one per row, in row order). `assign` builds a
# new frame instead of writing a column into what may be a slice of param_df,
# so the cell is warning-free and can be re-run safely.
top_params = top_params.assign(accuracy_round2=model_accuracy)

In [None]:
# Rank the parameter sets by round-2 accuracy, best first.
top_params = top_params.sort_values(by='accuracy_round2', ascending=False)

In [None]:
# Inspect the round-2 ranking (sorted by accuracy_round2, highest first).
top_params

Unnamed: 0,vocab_size,model_type,text_column,ngram,accuracy_round1,accuracy_round2
6,8096,MultinomialNB(),text_porter_stemmed,"(1, 3)",0.739,0.7749
0,8096,MultinomialNB(),text_no_numerals,"(1, 3)",0.7445,0.77415
5,8096,MultinomialNB(),text_no_numerals,"(1, 2)",0.739,0.7735
2,8096,MultinomialNB(),text_porter_stemmed,"(1, 4)",0.7405,0.7728
3,8096,MultinomialNB(),text_no_numerals,"(1, 4)",0.7395,0.7719
4,16384,BernoulliNB(),text_lancaster_stemmed,"(1, 3)",0.7395,0.7655
7,16384,BernoulliNB(),text_no_numerals,"(1, 4)",0.738,0.76545
8,8096,BernoulliNB(),text_porter_stemmed,"(1, 2)",0.738,0.7645
9,8096,BernoulliNB(),text_no_numerals,"(1, 2)",0.738,0.7641
1,8096,BernoulliNB(),text_porter_stemmed,"(1, 4)",0.741,0.76065


In [None]:
# Persist the round-2 ranking on Drive next to the round-1 results.
round2_csv = path + 'data/output/naive_bayes_params_round2.csv'
top_params.to_csv(round2_csv, index=False)

## train final model with best parameters

In [None]:
# Peek at one row to check the available text columns and the `target` label column.
all_train_data.head(1)

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized,target
0,0,anyway im getting of for a while,train,anyway im getting of for a while,anyway im getting of for a while,anyway im getting,anyway im get of for a while,anyway im get of for a whil,anyway im getting of for a while,2


In [None]:
# Configuration chosen for the final model:
#   8096 | BernoulliNB() | text_porter_stemmed | (1, 2)
final_vocab_size = 8096
final_ngram_range = (1, 2)

# Count-based bag-of-words over unigrams and bigrams, capped at the vocabulary size above.
vectorizer = CountVectorizer(ngram_range=final_ngram_range, max_features=final_vocab_size)

In [None]:
# Hold out 20 % of the labelled data; the fixed seed keeps the split repeatable.
final_text = all_train_data['text_porter_stemmed']
final_target = all_train_data['target']
X_train, X_test, y_train, y_test = train_test_split(
    final_text, final_target, test_size=0.2, random_state=42
)


In [None]:
# Learn the vocabulary from the training split only and encode that split as a count matrix.
train_BOW = vectorizer.fit_transform(X_train)



In [None]:
# Encode the hold-out split with the vocabulary already learned from the
# training split (transform only, no refit).
test_BOW = vectorizer.transform(X_test)
# Dense conversion is disabled; `test_BOW_array` is therefore not defined by this cell.
# test_BOW_array = test_BOW.toarray()

In [None]:
# Fit the final classifier on the training split only. `fit` takes
# (X, y[, sample_weight]); passing the hold-out matrix and labels as extra
# positional arguments raises a TypeError, and hold-out data must not take
# part in fitting anyway.
# NOTE(review): the round-2 ranking puts MultinomialNB configurations first —
# confirm that BernoulliNB is the intended final model.
model = BernoulliNB()
model.fit(train_BOW, y_train)

MultinomialNB()

In [None]:
# Class labels seen by the model during fitting.
model.classes_

array([0, 2])

In [None]:
# Predict a label for every row of the vectorized hold-out split. The sparse
# `test_BOW` matrix is used directly: `test_BOW_array` is never defined (its
# only assignment is commented out), so referencing it fails on a fresh kernel.
# NOTE(review): if these predictions are meant for a submission file, they
# should presumably be computed from `all_test_data` rather than from the
# hold-out split of the training data — confirm.
predictions = model.predict(test_BOW)

In [None]:
# Two-column frame: the positional row number as `id`, the predicted label as `target`.
pdf = pd.DataFrame(predictions).reset_index().set_axis(['id', 'target'], axis=1)


In [None]:
# Count how many rows were assigned to each predicted label.
pdf['target'].value_counts()

2    288103
0    272072
Name: target, dtype: int64

In [None]:
# pdf['target'] = [0 if t == 'negative' else 2 if t=='positive' else 2 for t in pdf['target'].values]

In [None]:
# Write the predictions next to the other outputs on Drive. Every other
# read/write in this notebook is prefixed with `path`; a bare relative path
# would resolve against the kernel's working directory instead.
pdf.to_csv(path+'data/output/first_pred.csv', index=False)

The learning-curve code below is taken from the scikit-learn example at https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    scoring=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Draw three diagnostic panels for an estimator.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.
    All numbers come from a single call to ``learning_curve`` with
    ``return_times=True``.

    Parameters
    ----------
    estimator : estimator instance
        Forwarded unchanged to ``learning_curve``.

    title : str
        Title of the first panel.

    X : array-like of shape (n_samples, n_features)
        Feature matrix, forwarded to ``learning_curve``.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Targets matching ``X``, forwarded to ``learning_curve``.

    axes : array-like of shape (3,), default=None
        Axes to draw on; elements 0, 1 and 2 are used. When None, a new
        1x3 figure of size (20, 5) is created.

    ylim : tuple of shape (2,), default=None
        ``(ymin, ymax)`` applied to the first panel only. Left untouched
        when None.

    cv : int, cross-validation generator or an iterable, default=None
        Cross-validation strategy, forwarded to ``learning_curve``.

    n_jobs : int or None, default=None
        Parallelism setting, forwarded to ``learning_curve``.

    scoring : str or callable, default=None
        Scoring specification, forwarded to ``learning_curve``.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute training-set sizes, forwarded to
        ``learning_curve`` (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    module
        The ``plt`` module, so the caller can continue with e.g. ``.show()``.
    """

    def _shade(ax, x, center, spread, **style):
        # Translucent band of one standard deviation on either side of the mean.
        ax.fill_between(x, center - spread, center + spread, alpha=0.1, **style)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    # Labels of the learning-curve panel are set before the (slow) computation.
    curve_ax = axes[0]
    curve_ax.set_title(title)
    if ylim is not None:
        curve_ax.set_ylim(*ylim)
    curve_ax.set_xlabel("Training examples")
    curve_ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )

    # Aggregate over the cross-validation splits (axis 1): one value per train size.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
    time_mean, time_std = np.mean(fit_times, axis=1), np.std(fit_times, axis=1)

    # Panel 0: learning curve (red = training, green = cross-validation).
    curve_ax.grid()
    _shade(curve_ax, train_sizes, train_mean, train_std, color="r")
    _shade(curve_ax, train_sizes, test_mean, test_std, color="g")
    curve_ax.plot(train_sizes, train_mean, "o-", color="r", label="Training score")
    curve_ax.plot(
        train_sizes, test_mean, "o-", color="g", label="Cross-validation score"
    )
    curve_ax.legend(loc="best")

    # Panel 1: number of training examples vs. fit time.
    timing_ax = axes[1]
    timing_ax.grid()
    timing_ax.plot(train_sizes, time_mean, "o-")
    _shade(timing_ax, train_sizes, time_mean, time_std)
    timing_ax.set_xlabel("Training examples")
    timing_ax.set_ylabel("fit_times")
    timing_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs. score, with points ordered by increasing fit time.
    by_time = time_mean.argsort()
    time_sorted = time_mean[by_time]
    score_sorted = test_mean[by_time]
    score_std_sorted = test_std[by_time]
    tradeoff_ax = axes[2]
    tradeoff_ax.grid()
    tradeoff_ax.plot(time_sorted, score_sorted, "o-")
    _shade(tradeoff_ax, time_sorted, score_sorted, score_std_sorted)
    tradeoff_ax.set_xlabel("fit_times")
    tradeoff_ax.set_ylabel("Score")
    tradeoff_ax.set_title("Performance of the model")

    return plt


# Demo on the scikit-learn digits dataset: one column of three panels per estimator.
X, y = load_digits(return_X_y=True)
fig, axes = plt.subplots(3, 2, figsize=(10, 15))

# Settings shared by both columns.
shared_plot_kwargs = dict(ylim=(0.7, 1.01), n_jobs=4)

# Left column: Gaussian Naive Bayes. 50 shuffle splits (20 % held out each
# time) give smoother mean train/test curves.
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(
    estimator, title, X, y, axes=axes[:, 0], cv=cv, scoring="accuracy", **shared_plot_kwargs
)

# Right column: RBF-kernel SVM. It is more expensive to fit, so only 5 splits are used.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], cv=cv, **shared_plot_kwargs)