# Applying algorithm to full dataset

In [16]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.deprecated.doc2vec import LabeledSentence
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
from string import digits

import pandas as pd
import numpy as np
import string
import re
import random 
import os
import csv
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')

from sklearn import metrics
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, precision_recall_curve, plot_precision_recall_curve, auc, average_precision_score,classification_report, confusion_matrix, accuracy_score, average_precision_score, precision_score, f1_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedStratifiedKFold, train_test_split,KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from modAL.uncertainty import entropy_sampling
from modAL.density import information_density

from scipy.stats import entropy
from matplotlib import pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carlyknight/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Training

In [17]:
#### preprocessing -------------------------------
# Every character in string.punctuation mapped to None; str.translate
# deletes characters whose table entry is None.
punctuation_dictionary = dict.fromkeys(string.punctuation)
punctuation_translator = str.maketrans(punctuation_dictionary)

# NLTK's English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# (remove punctuation, numbers, lowercase, stop words)
def text_cleaner_all(text, punctuation_translator):
    """Remove punctuation, digits and stop words from ``text`` and lowercase it.

    Parameters
    ----------
    text : object
        Raw document. ``None`` and float NaN (a missing cell in a pandas
        column) yield an empty string; any other non-string value is coerced
        with ``str`` before cleaning.
    punctuation_translator : dict
        Table for ``str.translate`` that deletes punctuation characters.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # A missing value has no text; without this guard it would either crash
    # on .replace() or be turned into the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce first so the string methods below are always called on a str.
    text = str(text).replace('c("', '')
    text = text.translate(punctuation_translator)
    text = text.lower()
    text = text.translate(str.maketrans('', '', digits))
    # The text is already lowercase, so tokens are compared to stop_words as-is.
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(kept_tokens)

# (remove punctuation, lowercase, stop words)
def text_cleaner_mod(text, punctuation_translator):
    """Remove punctuation and stop words from ``text`` and lowercase it.

    Digits are kept.

    Parameters
    ----------
    text : object
        Raw document. ``None`` and float NaN (a missing cell in a pandas
        column) yield an empty string; any other non-string value is coerced
        with ``str`` before cleaning.
    punctuation_translator : dict
        Table for ``str.translate`` that deletes punctuation characters.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # A missing value has no text; without this guard it would either crash
    # on .replace() or be turned into the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce first so the string methods below are always called on a str.
    text = str(text).replace('c("', '')
    text = text.translate(punctuation_translator)
    text = text.lower()
    # The text is already lowercase, so tokens are compared to stop_words as-is.
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(kept_tokens)

# (remove punctuation, lowercase)
def text_cleaner_min(text, punctuation_translator):
    """Remove punctuation from ``text`` and lowercase it.

    Parameters
    ----------
    text : object
        Raw document. ``None`` and float NaN (a missing cell in a pandas
        column) yield an empty string; any other non-string value is coerced
        with ``str`` before cleaning.
    punctuation_translator : dict
        Table for ``str.translate`` that deletes punctuation characters.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # A missing value has no text; without this guard it would either crash
    # on .replace() or be turned into the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce first so the string methods below are always called on a str.
    text = str(text).replace('c("', '')
    text = text.translate(punctuation_translator)
    return text.lower()

In [18]:
#data
# Labelled sample; the cells below read its `text` and `final_code` columns.
# Rows that are exact duplicates are dropped right after loading.
coded_sample_path = "/Users/carlyknight/Dropbox/PROJECTS/Forecasting Downturns/data/coded_sample_final.csv"
clas_dat = pd.read_csv(coded_sample_path).drop_duplicates()

# Cell output: (rows, columns) after de-duplication.
clas_dat.shape

(1196, 5)

In [19]:
#clean
# Normalise each document with text_cleaner_all before phrase detection.
clas_dat["clean_text"] = clas_dat["text"].apply(lambda doc: text_cleaner_all(doc, punctuation_translator))

# find phrases
# Whitespace-tokenise once and feed the same token lists to both passes.
tokenized_docs = [doc.split() for doc in clas_dat["clean_text"].tolist()]
phrases1 = Phrases(tokenized_docs) #bigram
phrases2 = Phrases(phrases1[tokenized_docs]) #trigram

# Re-join the phrase-merged tokens into one space-separated string per document.
clas_dat["phrased_text"] = clas_dat["clean_text"].apply(lambda doc: " ".join(phrases2[phrases1[doc.split()]]))

In [30]:
# vectorize
# Bag-of-words counts (terms must appear in at least 5 documents), then TF-IDF weighting.
vectorizer = CountVectorizer(min_df=5)
tfidfconverter = TfidfTransformer()

# Both fitted transformers are reused later to featurise the full corpus.
term_counts = vectorizer.fit_transform(clas_dat["phrased_text"])
X = term_counts.toarray()
X_tf = tfidfconverter.fit_transform(X).toarray()

# Target labels as a NumPy array, aligned row-for-row with X_tf.
y = np.array(clas_dat['final_code'])

In [21]:
#training set
# Hold out 20% of the labelled sample for a final check. The split is seeded
# so the held-out metrics are reproducible after a kernel restart, and
# stratified on y so both halves keep the same class balance (matching the
# stratified folds used in the grid search below).
# NOTE(review): the phrase models, vectorizer and TF-IDF weights were fitted on
# the whole labelled sample before this split, so the held-out scores may be
# slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_tf, y, test_size=0.2, random_state=1, stratify=y)

model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
scoring = ['accuracy', 'precision']

grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 5, random_state=1)
# Both metrics are tracked; the final model is refit using the best mean accuracy.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring=scoring, refit="accuracy")
grid_result = grid_search.fit(X_train, y_train)

# summarize results
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
precisions = grid_result.cv_results_['mean_test_precision']
accuracys =  grid_result.cv_results_['mean_test_accuracy']
std_prec = grid_result.cv_results_['std_test_precision']
std_acc = grid_result.cv_results_['std_test_accuracy']
params = grid_result.cv_results_['params']

# One line per parameter combination, in the order the grid was evaluated.
for prec, acc, param in zip(precisions, accuracys, params):
    print("Precision: %f (Accuracy: %f) with: %r" % (prec, acc, param))

Best Accuracy: 0.726930 using {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Precision: 0.720995 (Accuracy: 0.711068) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Precision: 0.720995 (Accuracy: 0.711068) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Precision: 0.720794 (Accuracy: 0.711068) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Precision: 0.738717 (Accuracy: 0.721721) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
Precision: 0.738717 (Accuracy: 0.721721) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Precision: 0.738204 (Accuracy: 0.721721) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Precision: 0.773143 (Accuracy: 0.726721) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Precision: 0.772927 (Accuracy: 0.726303) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Precision: 0.772632 (Accuracy: 0.726930) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Precision: 0.842146 (Accuracy: 0.672583) with: {'C':

In [22]:
# Predict the held-out split with the refit best model from the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)

# Labels keep their trailing space so the printed lines match the original format.
held_out_metrics = [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
]
for label, metric in held_out_metrics:
    print(label, metric(y_test, y_pred))

Accuracy:  0.75
Precision:  0.782608695652174
Recall:  0.6428571428571429
F1:  0.7058823529411765


In [23]:
#save
import joblib

# Persist the refit best classifier from the grid search.
# NOTE(review): only the classifier is written here; the fitted phrase models,
# vectorizer and TF-IDF transformer that produce its inputs are not saved, so
# the file alone is not enough to score new text.
best_estimator_path = '/Users/carlyknight/Dropbox/PROJECTS/Forecasting Downturns/data/best_estimator_1200-8-16-21.pkl'
joblib.dump(grid_search.best_estimator_, best_estimator_path)

['/Users/carlyknight/Dropbox/PROJECTS/Forecasting Downturns/data/best_estimator_1200-8-16-21.pkl']

# Entire dataset

In [24]:
#open all files
# Full corpus to be scored; later cells read its `text` and `id` columns.
all_text_path = "/Users/carlyknight/Dropbox/PROJECTS/Forecasting Downturns/data/all_text.csv"
fulldat = pd.read_csv(all_text_path)

In [33]:
#create a list of ys
# One array of predicted labels per chunk, appended in row order.
y_pred = []

# Work through the corpus in 1000 nearly equal chunks to bound memory use.
# Row positions are split instead of the DataFrame itself: np.array_split
# produces the same chunk boundaries either way, and no columns are written
# onto DataFrame slices.
N_CHUNKS = 1000
for i, rows in enumerate(np.array_split(np.arange(len(fulldat)), N_CHUNKS)):
    print("Working on chunk: ", str(i))

    # With fewer rows than chunks some chunks are empty; there is nothing to score.
    if len(rows) == 0:
        continue

    #clean
    clean_text = fulldat["text"].iloc[rows].apply(lambda x: text_cleaner_all(x, punctuation_translator))

    #apply the phrase models fitted on the labelled sample (phrases1 / phrases2).
    # They must NOT be refit per chunk: the vectorizer vocabulary and the classifier
    # were fitted on n-grams produced by those models, and chunk-local models would
    # merge different word pairs (and overwrite the training-fitted ones).
    phrased_text = clean_text.apply(lambda x: " ".join(phrases2[phrases1[x.split()]]))

    #vectorize
    # Chunk-local names keep the training matrices X / X_tf intact. The count
    # matrix stays sparse; only the TF-IDF result is densified for predict().
    X_chunk = vectorizer.transform(phrased_text)
    X_chunk_tf = tfidfconverter.transform(X_chunk).toarray()

    #predict
    y_pred.append(grid_search.best_estimator_.predict(X_chunk_tf))



Working on chunk:  0
Working on chunk:  1
Working on chunk:  2
Working on chunk:  3
Working on chunk:  4
Working on chunk:  5
Working on chunk:  6
Working on chunk:  7
Working on chunk:  8
Working on chunk:  9
Working on chunk:  10
Working on chunk:  11
Working on chunk:  12
Working on chunk:  13
Working on chunk:  14
Working on chunk:  15
Working on chunk:  16
Working on chunk:  17
Working on chunk:  18
Working on chunk:  19
Working on chunk:  20
Working on chunk:  21
Working on chunk:  22
Working on chunk:  23
Working on chunk:  24
Working on chunk:  25
Working on chunk:  26
Working on chunk:  27
Working on chunk:  28
Working on chunk:  29
Working on chunk:  30
Working on chunk:  31
Working on chunk:  32
Working on chunk:  33
Working on chunk:  34
Working on chunk:  35
Working on chunk:  36
Working on chunk:  37
Working on chunk:  38
Working on chunk:  39
Working on chunk:  40
Working on chunk:  41
Working on chunk:  42
Working on chunk:  43
Working on chunk:  44
Working on chunk:  4

Working on chunk:  361
Working on chunk:  362
Working on chunk:  363
Working on chunk:  364
Working on chunk:  365
Working on chunk:  366
Working on chunk:  367
Working on chunk:  368
Working on chunk:  369
Working on chunk:  370
Working on chunk:  371
Working on chunk:  372
Working on chunk:  373
Working on chunk:  374
Working on chunk:  375
Working on chunk:  376
Working on chunk:  377
Working on chunk:  378
Working on chunk:  379
Working on chunk:  380
Working on chunk:  381
Working on chunk:  382
Working on chunk:  383
Working on chunk:  384
Working on chunk:  385
Working on chunk:  386
Working on chunk:  387
Working on chunk:  388
Working on chunk:  389
Working on chunk:  390
Working on chunk:  391
Working on chunk:  392
Working on chunk:  393
Working on chunk:  394
Working on chunk:  395
Working on chunk:  396
Working on chunk:  397
Working on chunk:  398
Working on chunk:  399
Working on chunk:  400
Working on chunk:  401
Working on chunk:  402
Working on chunk:  403
Working on 

Working on chunk:  718
Working on chunk:  719
Working on chunk:  720
Working on chunk:  721
Working on chunk:  722
Working on chunk:  723
Working on chunk:  724
Working on chunk:  725
Working on chunk:  726
Working on chunk:  727
Working on chunk:  728
Working on chunk:  729
Working on chunk:  730
Working on chunk:  731
Working on chunk:  732
Working on chunk:  733
Working on chunk:  734
Working on chunk:  735
Working on chunk:  736
Working on chunk:  737
Working on chunk:  738
Working on chunk:  739
Working on chunk:  740
Working on chunk:  741
Working on chunk:  742
Working on chunk:  743
Working on chunk:  744
Working on chunk:  745
Working on chunk:  746
Working on chunk:  747
Working on chunk:  748
Working on chunk:  749
Working on chunk:  750
Working on chunk:  751
Working on chunk:  752
Working on chunk:  753
Working on chunk:  754
Working on chunk:  755
Working on chunk:  756
Working on chunk:  757
Working on chunk:  758
Working on chunk:  759
Working on chunk:  760
Working on 

In [44]:
#add column
# Flatten the per-chunk prediction arrays into one list, preserving row order.
y_pred_list = []
for chunk_predictions in y_pred:
    y_pred_list.extend(chunk_predictions)

# Attach the predictions to the corpus; the assignment is positional, one value per row.
fulldat['prediction'] = y_pred_list

#keep id and prediction and output
output = fulldat.loc[:, ["id", "prediction"]]
predictions_path = "/Users/carlyknight/Dropbox/PROJECTS/Forecasting Downturns/data/text_predictions_8-19-21.csv"
output.to_csv(predictions_path)
