In [None]:
import numpy as np
import os
import pickle
from sklearn.linear_model import LogisticRegression
from yellowbrick.model_selection import ValidationCurve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Set data_path to the directory that holds the data file I sent out,
# '1_234_5_balanced_data.pkl' (leave it empty to read from the working directory).
data_path = ''

# NOTE: pickle.load can execute arbitrary code -- only open pickles from a trusted source.
with open(os.path.join(data_path, '1_234_5_balanced_data.pkl'), 'rb') as f:
    pickled_data = pickle.load(f)

# Only the first element of the pickled object is used by the rest of the notebook.
data = pickled_data[0]

In [None]:
# Quick sanity check that the remapped labels are balanced: every count shown below should be equal.
data.stars.value_counts()

In [None]:
def split(all_data, labels, prop_test = 0.3, make_arrays=True, save_data=False, random_state=None):
    """Split samples and labels into stratified train and test sets.

    Parameters
    ----------
    all_data : array-like
        The samples to split.
    labels : array-like
        Labels aligned with ``all_data``; they are also used to stratify the split.
    prop_test : float or int, default 0.3
        Forwarded to ``train_test_split`` as ``test_size``.
    make_arrays : bool, default True
        If True, every returned piece that exposes ``to_numpy()`` (pandas
        Series/DataFrame, for example) is converted with it. Pieces without
        that method are returned unchanged.
    save_data : bool, default False
        If True, pickle ``[x_train, y_train]`` to ``train.pkl`` and
        ``[x_test, y_test]`` to ``test.pkl`` in the current working directory.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to make the split
        reproducible; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        all_data,
        labels,
        test_size=prop_test,
        stratify=labels,
        random_state=random_state,
    )

    if make_arrays:
        # Inspect the split outputs themselves for a to_numpy() method instead of
        # searching the input's type name for 'DataFrame'/'Series'.
        x_train, x_test, y_train, y_test = [
            part.to_numpy() if hasattr(part, 'to_numpy') else part
            for part in (x_train, x_test, y_train, y_test)
        ]

    if save_data:
        with open('train.pkl', 'wb') as f:
            pickle.dump([x_train, y_train], f)
        with open('test.pkl', 'wb') as f:
            pickle.dump([x_test, y_test], f)

    return [x_train, x_test, y_train, y_test]

In [None]:
# Split the data 80/20 into train and test sets, stratified on the 'stars' labels.
# Nothing is written to disk here (save_data=False) and the pieces are not
# converted to NumPy arrays (make_arrays=False).
x_train, x_test, y_train, y_test = split(
    data.loc[:, 'processed_text'],
    data.loc[:, 'stars'],
    prop_test=0.2,
    make_arrays=False,
    save_data=False,
)

In [None]:
# Sanity check: display the shape of the test split to confirm x_test was created properly.
x_test.shape

In [None]:
# Sanity check: display the shape of the training split to confirm x_train was created properly.
x_train.shape

In [None]:
# NOTE: this cell rebinds x_train / x_test to their TF-IDF matrices, so it expects
# the output of the split cell as input; re-run the split cell before running
# this one a second time.
print('Fitting tfidfer to x_train')
# ngram_range=(1, 2): use unigrams and bigrams. The vectorizer is fitted on the
# training split only and then applied to both splits.
tfidfer = TfidfVectorizer(ngram_range=(1, 2))
tfidfer.fit(x_train)
print('Finished fitting tfidfer to x_train')

print('Transforming x_train with tfidfer')
x_train = tfidfer.transform(x_train)
print('Finished transforming x_train')

print('Transforming x_test with tfidfer')
x_test = tfidfer.transform(x_test)
print('Finished transforming x_test')

In [None]:
# Sanity check: after the TF-IDF transform, x_train should display as a sparse matrix.
x_train

In [None]:
# Sanity check: after the TF-IDF transform, x_test should display as a sparse matrix.
x_test

In [None]:
# Note: this saves the TF-IDFed data under the data_path set at the beginning of the notebook.
print('Saving the TF-IDFed data\n')
# Open each output file in a `with` block so it is flushed and closed as soon as
# the dump finishes, rather than leaving the handle open.
with open(os.path.join(data_path, '1_234_5_tfidfed_train.pkl'), 'wb') as f:
    pickle.dump([x_train, y_train], f)
with open(os.path.join(data_path, '1_234_5_tfidfed_test.pkl'), 'wb') as f:
    pickle.dump([x_test, y_test], f)
print('Data saved in pickle files')