In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [7]:


df = pd.read_csv('dffulltext.csv')

In [8]:
df['target'] = df['category'].replace(['the worldpost', 'worldpost'],'world news')


In [9]:
df['target'] = df['target'].replace(['black voices', 'queer voices', 'latino voices', 'women'],'diverse voices')

In [10]:
from ratelimit import limits, RateLimitException
from backoff import on_exception, expo
import requests

In [11]:
from newspaper.article import ArticleException

In [12]:
df = df.dropna()

In [13]:
#df2k['text'] = df2k['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#df['text'].head()

df['lower_text'] = df['full_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [14]:


#remove punctuation
df.lower_text = df.lower_text.str.replace('[^\w\s]','')



In [15]:
import nltk
import string
import re 
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnsimmons/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
#stopword gathering and removal
stop = stopwords.words('english')
df['lower_text'] = df['lower_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))



In [17]:
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnsimmons/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:

def clean_history(history):
    history = "".join([word.lower() for word in history if word not in string.punctuation])
    tokens = re.split('\W+', history)
    history = [wn.lemmatize(word) for word in tokens if word not in stop]
    history = ' '.join(history)
    return history
df['text_clean'] = df['lower_text'].apply(lambda x: clean_history(x.lower()))


In [21]:
df.text_clean.apply(lambda x: len(x.split()))

0        1380
1          56
2         107
3         122
4          43
         ... 
40378     214
40379     281
40380     525
40381     509
40382     238
Name: text_clean, Length: 37747, dtype: int64

In [22]:
y = df.target
X = df.text_clean

In [23]:
# generate tf-idf vectorization (use sklearn's TfidfVectorizer) for our data
def tfidf(X, y,  stopwords_list): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)
    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer


In [24]:
stopwords_list = stopwords.words('english')
idf_train, idf_test, y_tr, y_t, vectorizer = tfidf(X, y, stopwords_list)

In [25]:
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    classifier.fit(tf_idf_train, y_train)
    train_preds = classifier.predict(tf_idf_train)
    test_preds = classifier.predict(tf_idf_test)
    return train_preds, test_preds

def score_preds(y_test,y_train,test_preds, train_preds):
    print("Train Acc: ", accuracy_score(y_train, train_preds))
    print("Test Acc: ", accuracy_score(y_test, test_preds))
#     confusion_matrix(y_test, test_preds)

In [26]:
rfc = RandomForestClassifier(class_weight='balanced', n_estimators=50, max_depth=50, n_jobs=-1)
nb_classifier = MultinomialNB()

In [27]:
nb_train_preds, nb_test_preds = classify_text(nb_classifier, idf_train, idf_test, y_tr)

In [28]:
#Call function for score
score_preds(y_t, y_tr, nb_test_preds, nb_train_preds)

Train Acc:  0.40095372659837514
Test Acc:  0.4000211931757974


In [29]:


#Call function using results of vectorize function using RF model
rf_train_preds, rf_test_preds = classify_text(rfc, idf_train, idf_test, y_tr)

#Call function for score
score_preds(y_t, y_tr, rf_test_preds, rf_train_preds)

Train Acc:  0.9600494524902862
Test Acc:  0.6048532372576031


In [30]:

import gensim 
import gensim.downloader as api 



In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   window=5,
                                   min_count=2)

Updating saved json file


In [33]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(df['target'])
encoded_Y = encoder.transform(df['target'])
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [34]:
rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(df['text_clean'],
                                                                    dummy_y)

In [35]:
from keras.preprocessing.text import Tokenizer #clean and tokenize the data 
from keras.preprocessing.sequence import pad_sequences
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(rnn_X_train)

In [36]:
X_train_seq = tokenizer.texts_to_sequences(rnn_X_train)
X_test_seq = tokenizer.texts_to_sequences(rnn_X_test)

In [40]:
tokens = [tokenizer.texts_to_sequences(rnn_X_train[i]) for i in rnn_X_train.index]

In [41]:
from nltk.probability import FreqDist

freqdist = FreqDist()


In [38]:
# bag = ''

# for val in rnn_X_train.values:
#     bag = bag + ' ' + val

In [389]:
# Pad the sequences so each sequence is the same length

X_train_seq_padded = pad_sequences(X_train_seq, 6185) 
X_test_seq_padded = pad_sequences(X_test_seq, 6185) 

In [390]:
# Import the tools needed from keras and define functions to calculate recall and precision
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

In [391]:
len(df.target.unique())

25

In [403]:
# Construct a simple RNN model
model = Sequential()

model.add(Embedding(len(tokenizer.index_word)+1, 32))
model.add(LSTM(32, dropout=.2, recurrent_dropout=0))
model.add(Dense(32, activation='relu'))
#model.add(Dense(8, activation='softmax'))
model.add(Dense(16, activation='relu'))
model.add(Dense(25, activation='sigmoid')) 
model.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, None, 32)          6127520   
_________________________________________________________________
lstm_25 (LSTM)               (None, 32)                8320      
_________________________________________________________________
dense_74 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_75 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_76 (Dense)             (None, 25)                425       
Total params: 6,137,849
Trainable params: 6,137,849
Non-trainable params: 0
_________________________________________________________________


In [404]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', precision_m, recall_m])

In [408]:
# Fit the RNN model
history = model.fit(X_train_seq, rnn_y_train, 
                    batch_size=16, epochs=10,
                    validation_data=(X_test_seq, rnn_y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [44]:
import pickle

In [45]:
with open('vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer, f, protocol = pickle.HIGHEST_PROTOCOL)