In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from keras.models import Model, load_model
from keras.utils import CustomObjectScope
from keras.optimizers import SGD, Adam

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Directory that holds the pre-split, pickled feature/label sets.
DATA_DIR = 'data_final/aws_data'


def _load_split(name, data_dir=DATA_DIR):
    '''
    Loads one pickled object from the data directory.

    name: Base file name without the '.pkl' extension (e.g. 'X_train').
    data_dir: Directory containing the pickle files.

    Returns: The unpickled object.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open('{}/{}.pkl'.format(data_dir, name), 'rb') as infile:
        return pickle.load(infile)


X_train = _load_split('X_train')
X_val = _load_split('X_val')
X_test = _load_split('X_test')

y_train = _load_split('y_train')
y_val = _load_split('y_val')
y_test = _load_split('y_test')

In [3]:
def recombine(array):
    '''
    Rejoins the lists of words in an article pre-formatted for training into a single string.

    array: Iterable of sentences, where each sentence is an iterable of word strings.

    Returns: String containing all the words in the article, joined by spaces.
    '''
    # Join the words inside each sentence, then join the sentences themselves.
    return ' '.join(' '.join(sent) for sent in array)

In [4]:
# Collapse each validation article (nested lists of words) into a single string.
data = X_val.apply(recombine)

In [5]:
# Stack the validation and training articles (validation first) into one corpus of strings.
# The pandas `append` method was removed in pandas 2.0, so pd.concat is used instead;
# building the result directly from X_val and X_train also keeps this cell safe to re-run.
data = pd.concat([X_val.apply(recombine), X_train.apply(recombine)])

In [6]:
# Labels in the same order as the corpus: validation first, then training.
# pd.concat replaces the pandas `append` method, which was removed in pandas 2.0.
labels = pd.concat([y_val, y_train])

In [7]:
# Bag-of-words encoder with the vocabulary capped at 50,000 features.
counter = CountVectorizer(decode_error='ignore', strip_accents='unicode', max_features=50000)
# fit_transform learns the vocabulary and vectorises the corpus in a single pass,
# so no separate fit() call is needed.
# NOTE: `data` is rebound from the string corpus to the count matrix here; re-run the
# cells that build `data` before re-running this one.
data = counter.fit_transform(data)

In [8]:
# TF-IDF re-weighting that is applied on top of the raw term counts.
tfidf = TfidfTransformer()

# Ten log-spaced candidate values for C, from 1e-4 up to (but excluding) 1e1.
Cs = list(np.logspace(start=-4, stop=1, num=10, endpoint=False))
clf = LogisticRegressionCV(cv=5, solver='sag', Cs=Cs)

In [9]:
# Re-weight the counts with TF-IDF and run the cross-validated search over Cs.
lrcv = clf.fit(tfidf.fit_transform(data), labels)

In [10]:
# Show the value of C chosen by the cross-validated search.
lrcv.C_

array([ 3.16227766])

In [11]:
# The best C found so far was the largest candidate tried, so search again over
# larger values: six log-spaced candidates from 1e0 to 1e2.
Cs = list(np.logspace(start=0, stop=2, num=6))
clf = LogisticRegressionCV(cv=5, solver='sag', Cs=Cs)
lrcv = clf.fit(tfidf.fit_transform(data), labels)
lrcv.C_



array([ 100.])

In [12]:
# Search a higher range: seven log-spaced candidates from 1e2 to 1e4, with max_iter set to 200.
Cs = list(np.logspace(start=2, stop=4, num=7))
clf = LogisticRegressionCV(max_iter=200, cv=5, solver='sag', Cs=Cs)
lrcv = clf.fit(tfidf.fit_transform(data), labels)
lrcv.C_



array([ 100.])

In [13]:
# Show the candidate C values currently held in Cs.
Cs

[100.0,
 215.44346900318845,
 464.15888336127773,
 1000.0,
 2154.4346900318824,
 4641.5888336127773,
 10000.0]

In [15]:
# Refine on a linear grid of six evenly spaced candidates: 50, 75, 100, 125, 150, 175.
Cs = list(np.linspace(start=50, stop=175, num=6))
clf = LogisticRegressionCV(max_iter=200, cv=5, solver='sag', Cs=Cs)
lrcv = clf.fit(tfidf.fit_transform(data), labels)
lrcv.C_

array([ 125.])

In [17]:
# Rebuild one string per test article, then encode it with the vocabulary already
# learned by `counter` (transform only, no refit).
strings = X_test.apply(recombine)
# NOTE(review): X_test is rebound to the transformed matrix here, so this cell cannot be
# re-run without reloading X_test first. The training features were also passed through
# `tfidf`; confirm that tfidf.transform is applied to X_test before it is scored.
X_test = counter.transform(strings)

In [None]:
# Final classifier, using C=125.0 as selected by the last cross-validated search.
lr = LogisticRegression(C=125.0, solver='sag', random_state=77)
lr.