In [1]:
import ast
import string
from itertools import product

import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
tqdm.pandas()

In [3]:
# Directory holding the prepared train/test CSVs and the training targets.
input_path = 'data/input/'

In [4]:
# Load the pre-processed train and test frames. Missing values in every column
# are replaced with '' so the text columns never contain NaN.
all_train_data = pd.read_csv(input_path + 'train_data_prepped.csv').fillna('')
test = pd.read_csv(input_path + 'test_data_prepped.csv').fillna('')

In [5]:
# Preview the first rows of the training frame.
all_train_data.head()

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized
0,0,anyway im getting of for a while,train,anyway im getting of for a while,anyway im getting of for a while,anyway im getting,anyway im get,anyway im get,anyway im getting
1,1,"my red, apache isn't feelin too well this morn...",train,my red apache isnt feelin too well this mornin...,my red apache isnt feelin too well this mornin...,red apache feelin well morning httpmypictmen,red apach feelin well morn httpmypictmen,red apach feelin wel morn httpmypictmen,red apache feelin well morning httpmypictmen
2,2,@danyelljoy you should be its great. friday w...,train,danyelljoy you should be its great friday wil...,danyelljoy you should be its great friday wil...,danyelljoy great friday great tooooooo,danyelljoy great friday great tooooooo,danyelljoy gre friday gre tooooooo,danyelljoy great friday great tooooooo
3,3,its 11:30pm and i dont wanna sleep; so i debat...,train,its 1130pm and i dont wanna sleep so i debated...,its pm and i dont wanna sleep so i debated wit...,pm wanna sleep debated end decided perfect tim...,pm wanna sleep debat end decid perfect time ba...,pm wann sleep deb end decid perfect tim bake! kid,pm wanna sleep debated end decided perfect tim...
4,4,why does twitter eat my dm's? not happy,train,why does twitter eat my dms? not happy,why does twitter eat my dms? not happy,twitter eat dms? happy,twitter eat dms? happi,twit eat dms? happy,twitter eat dms? happy


In [6]:
# Sanity check: list any training rows that still contain nulls
# (none are expected, since the frame was loaded with fillna('')).
all_train_data[all_train_data.isnull().any(axis=1)]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


In [7]:
# Sanity check: list any test rows that still contain nulls
# (none are expected, since the frame was loaded with fillna('')).
test[test.isnull().any(axis=1)]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


Load the training targets, drop the neutral rows, and binarize the labels.

In [8]:
# Load the training labels and inspect the class balance.
train_results = pd.read_csv(input_path + 'train_results.csv')
train_results['target'].value_counts()

positive    520436
negative    519803
neutral         84
Name: target, dtype: int64

In [9]:
# Preview the first rows of the label frame.
train_results.head()

Unnamed: 0,id,target
0,0,positive
1,1,negative
2,2,positive
3,3,positive
4,4,negative


In [10]:
# Attach each label by matching on `id` instead of relying on both frames having
# identical row order/index, so a reordered or filtered results file cannot
# silently misalign text and target.
all_train_data['target'] = all_train_data['id'].map(train_results.set_index('id')['target'])

Drop the neutral target rows to keep the problem binary.

In [12]:
# Keep only the positive/negative rows. `.copy()` makes the filtered frame
# independent of the original so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
all_train_data = all_train_data[all_train_data['target'] != 'neutral'].copy()

# Encode 'positive' as 1 and every other remaining label (i.e. 'negative') as 0.
all_train_data['target'] = (all_train_data['target'] == 'positive').astype('int64')
all_train_data['target'].value_counts()


1    520436
0    519803
Name: target, dtype: int64

### prepare bag of words<br>
we need a different model for each text format

Naive Bayes has almost no hyperparameters to tune, so instead we'll test different versions of the text processing combined with different Naive Bayes variants and corpus-generation settings.

the following parameters will be tested<br>
1. size of corpus vocab
2. corpus ngram varieties
3. bernoulli, multinomial or gaussian naive bayes
4. stemming/lemmatizing methods

In [14]:
# Search space for the first-round parameter sweep.
vocab_sizes = [100, 1_000, 10_000, None]   # None = keep the full vocabulary
ngrams = [(1, 1), (2, 2), (1, 2), (1, 3)]
models = [BernoulliNB(), GaussianNB(), MultinomialNB()]
text_columns = ['text_no_numerals', 'text_no_sw', 'text_porter_stemmed', 'text_lancaster_stemmed', 'text_lemmatized']
# probability of the positive class at or above which a row is predicted positive
threshold = 0.5

# Shuffle the training data. The seed makes the shuffle (and therefore every
# subset taken below) reproducible across kernel restarts.
all_train_data = all_train_data.sample(frac=1, random_state=42)

# Split features from the label by column name rather than by position, so the
# split does not depend on 'target' being the last column.
train_X = all_train_data.drop(columns='target')
train_y = all_train_data['target']


# get 5000 data points for first round of training
train_v1 = train_X.iloc[:5000]
targets_v1 = train_y.iloc[:5000]

In [15]:
# Peek at the labels of the first-round subset.
targets_v1.head()

691997    0
176283    1
359235    1
993260    0
209133    0
Name: target, dtype: int64

In [16]:
def trainModels(df, targets, vocab_sizes, models, text_columns, ngrams, threshold):
    """Score every vocab-size / model / text-column / n-gram combination.

    For each combination a bag-of-words matrix is built from ``df[col]``,
    split 80/20 into train/validation (fixed ``random_state=42``), the model is
    fitted on the training part, and validation accuracy is measured by
    thresholding the predicted probability of class ``1``.

    Parameters
    ----------
    df : DataFrame containing every column named in ``text_columns``.
    targets : binary labels (0/1) aligned row-for-row with ``df``.
    vocab_sizes : values passed to ``CountVectorizer(max_features=...)``;
        ``None`` keeps the whole vocabulary.
    models : estimator instances exposing ``fit``, ``predict_proba`` and
        ``classes_``. Each instance is re-fitted for every combination.
    text_columns : names of the text columns to vectorize.
    ngrams : ``(min_n, max_n)`` tuples passed as ``ngram_range``.
    threshold : a row is predicted positive when P(class 1) >= threshold.

    Returns
    -------
    DataFrame with columns ``vocab_size``, ``model_type``, ``text_column``,
    ``ngram`` and ``accuracy``, sorted by accuracy in descending order.

    Raises
    ------
    ValueError
        If a fitted model has no class ``1`` (raised by ``list.index``).
    """
    # Same visiting order as four nested loops:
    # vocab size -> model -> text column -> n-gram range.
    combos = list(product(vocab_sizes, models, text_columns, ngrams))

    param_tracker = []
    for size, model, col, ng in tqdm(combos, desc='param combinations'):
        vectorizer = CountVectorizer(max_features=size, ngram_range=ng)

        # NOTE(review): the vocabulary is learned from all rows, including the
        # ones that end up in the validation split.
        BOW = vectorizer.fit_transform(df[col])

        # GaussianNB needs a dense array; the other models take the sparse
        # matrix directly, which avoids materialising a rows x vocab array.
        features = BOW.toarray() if isinstance(model, GaussianNB) else BOW

        X_train, X_valid, y_train, y_valid = train_test_split(
            features, targets, test_size=0.2, random_state=42
        )
        model.fit(X_train, y_train)

        prob_predictions = model.predict_proba(X_valid)

        # column of predict_proba that holds P(class == 1)
        pos_col = list(model.classes_).index(1)
        binary_preds = (prob_predictions[:, pos_col] >= threshold).astype(int)

        num_correct_pred = int((binary_preds == np.asarray(y_valid)).sum())
        accuracy = num_correct_pred / len(y_valid)

        param_tracker.append((size, str(model), col, ng, accuracy))

    param_df = pd.DataFrame(param_tracker, columns = ['vocab_size', 'model_type', 'text_column', 'ngram', 'accuracy'])
    param_df = param_df.sort_values(by='accuracy', ascending=False)
    return param_df

In [None]:
# Run the parameter sweep on the first-round subset (train_v1 / targets_v1).
param_df = trainModels(train_v1, targets_v1, vocab_sizes, models, text_columns, ngrams, threshold)

In [None]:
# Show the 20 best-scoring combinations (trainModels returns rows sorted by accuracy, descending).
param_df[:20]

In [None]:
# Uncomment to persist the sweep results; a later cell reloads them from this same path.
# param_df.to_csv('data/output/naive_bayes_params.csv', index=False)

In [None]:
# Reload the saved sweep results instead of re-running the search.
param_df = pd.read_csv('data/output/naive_bayes_params.csv')

# The CSV round-trip stores each ngram tuple as a string such as "(1, 2)".
# Parse it back into a tuple of ints; literal_eval handles multi-digit bounds
# and any spacing, unlike indexing fixed character positions.
param_df['ngram'] = [tuple(int(n) for n in ast.literal_eval(ng)) for ng in param_df['ngram'].values]

Based on these results, we'll re-evaluate the top 20 combinations on a larger sample of the training data.

In [None]:
top_params = param_df[:10]

# The cells below iterate over `top_20` and later add a column to it, so define
# it here as an independent copy of the 20 best rows (a copy avoids pandas'
# SettingWithCopyWarning on that later assignment).
top_20 = param_df[:20].copy()

In [None]:
# Number of rows used for the second-round evaluation, and how many of those
# rows go to training (80%); the remainder is used for validation.
num_points = 100_000
num_train = int(num_points*0.8)

In [None]:
# Total number of labelled training rows available (`train` was never defined;
# the full labelled frame is `all_train_data`).
len(all_train_data)

In [None]:

full_data_accuracy = []

# Every parameter set is scored on the same slice of the already-shuffled data,
# so the resulting accuracies are directly comparable.
training_data = train_X.iloc[:num_points]
training_targets = train_y.iloc[:num_points]

# Map the stored model repr (str(model) from the sweep) back to its class.
model_factories = {
    'MultinomialNB()': MultinomialNB,
    'BernoulliNB()': BernoulliNB,
    'GaussianNB()': GaussianNB,
}

# Select the parameter columns by name so this cell still works after the
# accuracy column has been added to top_20 by a later cell.
param_columns = ['vocab_size', 'model_type', 'text_column', 'ngram']

for i, param_set in enumerate(tqdm(top_20[param_columns].values)):
    vocab_size, model_type, text_column, ngram = param_set
    print(vocab_size, model_type, text_column, ngram,)

    # A vocab size of None is stored as NaN in the frame; int(NaN) would raise,
    # so translate it back to None (= unlimited vocabulary).
    vocab_size = None if pd.isna(vocab_size) else int(vocab_size)

    if model_type not in model_factories:
        print('warning, unrecognized model!')
        print(i, param_set)
        # Keep one entry per parameter set so the results still line up with top_20.
        full_data_accuracy.append(np.nan)
        continue
    model = model_factories[model_type]()

    print('vectorizing')
    vectorizer = CountVectorizer(max_features=vocab_size, ngram_range=ngram)

    print('creating BOW')
    BOW = vectorizer.fit_transform(training_data[text_column])

    # Only GaussianNB needs a dense array; the other models accept the sparse
    # matrix, which avoids allocating a num_points x vocab dense array.
    features = BOW.toarray() if isinstance(model, GaussianNB) else BOW

    print('splitting train-validation data')
    X_train = features[:num_train]
    y_train = training_targets.iloc[:num_train]

    X_valid = features[num_train:]
    y_valid = training_targets.iloc[num_train:]

    print('fitting model')
    model.fit(X_train, y_train)

    print('getting predictions')
    prob_predictions = model.predict_proba(X_valid)

    # column of predict_proba that holds P(class == 1)
    pos_col = list(model.classes_).index(1)
    binary_preds = (prob_predictions[:, pos_col] >= threshold).astype(int)

    # get accuracy
    num_correct_pred = int((binary_preds == np.asarray(y_valid)).sum())
    accuracy = num_correct_pred / len(y_valid)
    print(accuracy)
    print('\n')

    full_data_accuracy.append(accuracy)
    
    

In [None]:
# Attach the second-round validation accuracy to each parameter set
# (full_data_accuracy must hold one value per row of top_20).
top_20['final_validation_acc'] = full_data_accuracy

In [None]:
# Save the parameter sets with their validation accuracy (relative path, no directory prefix).
top_20.to_csv('top_20_naive_bayes.csv', index=False)

## train final model with best parameters

In [None]:
# Best configuration from the evaluation above:
# 10000.0 BernoulliNB() text_lemmatized (1, 2)

print('vectorizing')
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2))

print('creating BOW')

# Fit on the full shuffled training set (train_X / train_y) rather than on
# `training_data`, which only exists as a leftover of the evaluation loop.
# The matrix is kept sparse: BernoulliNB accepts sparse input, and a dense
# rows x 10000 array would not fit in memory for the full data.
BOW = vectorizer.fit_transform(train_X['text_lemmatized'])


model = BernoulliNB()
# fit() requires both the features and the labels.
model.fit(BOW, train_y)

In [None]:
# Number of training labels.
len(train_y)

In [None]:
# Display the training labels.
train_y