In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/willcline/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Relative path to the news-category dataset (JSON file) loaded by this notebook.
path = "../../data/News_Category_Dataset_v2.json"

In [3]:
# Load the dataset; lines=True tells pandas the file holds one JSON record per line.
df = pd.read_json(path,lines=True)

In [4]:
# Preview the first two rows to see which columns were loaded.
df.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26


In [5]:
# Combine headline and short description into a single text column, joined by "; ".
df['all_words'] = df['headline'].str.cat(df['short_description'], sep="; ")

In [6]:
# Fold the 'CULTURE & ARTS' label into 'ARTS & CULTURE' so both share one category.
df['category'] = df['category'].replace({'CULTURE & ARTS': 'ARTS & CULTURE'})

In [7]:
# Row and column counts of the frame at this point.
df.shape

(200853, 7)

In [8]:
# Drop the 'link' and 'date' columns. Reassigning (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution no longer raises KeyError for the already-removed columns.
df = df.drop(columns=['link', 'date'], errors='ignore')

In [9]:
# Preview the first two rows again to confirm the remaining columns.
df.head(2)

Unnamed: 0,category,headline,authors,short_description,all_words
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...


In [10]:
# Share of rows per category, as a percentage rounded to two decimals.
value_counts = df['category'].value_counts(normalize=True).mul(100).round(2)

In [11]:
# Display the category distribution (percent of rows per label).
value_counts

POLITICS          16.30
WELLNESS           8.88
ENTERTAINMENT      7.99
TRAVEL             4.92
STYLE & BEAUTY     4.80
PARENTING          4.32
HEALTHY LIVING     3.33
QUEER VOICES       3.14
FOOD & DRINK       3.10
BUSINESS           2.96
COMEDY             2.58
SPORTS             2.43
BLACK VOICES       2.25
HOME & LIVING      2.09
PARENTS            1.97
THE WORLDPOST      1.82
WEDDINGS           1.82
WOMEN              1.74
IMPACT             1.72
DIVORCE            1.71
CRIME              1.70
MEDIA              1.40
WEIRD NEWS         1.33
GREEN              1.31
WORLDPOST          1.28
RELIGION           1.27
ARTS & CULTURE     1.18
STYLE              1.12
SCIENCE            1.08
WORLD NEWS         1.08
TASTE              1.04
TECH               1.04
MONEY              0.85
ARTS               0.75
FIFTY              0.70
GOOD NEWS          0.70
ENVIRONMENT        0.66
COLLEGE            0.57
LATINO VOICES      0.56
EDUCATION          0.50
Name: category, dtype: float64

### Make target and feature DataFrames

### Get list of stop words

In [14]:
# Tokens for TfidfVectorizer to ignore: NLTK's English stop words followed by
# each character of string.punctuation.
stopwords_list = [*stopwords.words('english'), *string.punctuation]

### Basic cleaning

In [15]:
# Lower-case each headline and collapse runs of whitespace to single spaces.
# The vectorised .str accessors replace the per-row lambda (which shadowed its
# own argument name) and pass missing headlines through as NaN instead of
# raising AttributeError.
df['lower_desc'] = df['headline'].str.lower().str.split().str.join(' ')

In [16]:
# Remove punctuation: delete every character that is neither a word character
# nor whitespace. regex=True is required because pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently leave the text
# unchanged; the raw string avoids invalid-escape warnings for \w and \s.
df['lower_desc'] = df['lower_desc'].str.replace(r'[^\w\s]', '', regex=True)

  df['lower_desc'] = df['lower_desc'].str.replace('[^\w\s]','')


In [17]:
# Stop-word gathering and removal.
stop = stopwords.words('english')
# Membership tests against a list rescan it for every token; a frozenset gives
# the same answers with constant-time lookups.
_stop_lookup = frozenset(stop)
# NOTE(review): if punctuation was stripped from lower_desc before this cell,
# contractions such as "don't" arrive as "dont" and will not match the stop
# list -- confirm whether stop words should be removed before punctuation.
df['lower_desc'] = df['lower_desc'].apply(
    lambda text: " ".join(token for token in text.split() if token not in _stop_lookup)
)

In [18]:
# Feature frame: every column except the 'category' label.
X = df.drop(columns=['category'])

In [19]:
# Target vector: the 'category' label of each row.
y = df['category']

### Vectorize data with TF-IDF (one of the three vectorizer methods) and make a train/test split

In [20]:
# generate tf-idf vectorization (use sklearn's TfidfVectorizer) for our data
def tfidf(X, y, stopwords_list, test_size=None, random_state=42):
    """Split the data into train/test sets and TF-IDF vectorize the text.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so the vocabulary and IDF weights come from training rows.

    Parameters
    ----------
    X
        Raw text documents, passed to ``train_test_split`` and then to
        ``TfidfVectorizer`` (the notebook passes the ``headline`` column).
    y
        Labels aligned with ``X``.
    stopwords_list
        Tokens handed to ``TfidfVectorizer(stop_words=...)``.
    test_size
        Forwarded to ``train_test_split``; ``None`` keeps scikit-learn's
        default split, which is what the previously hard-coded call used.
    random_state
        Seed forwarded to ``train_test_split``; defaults to the previously
        hard-coded value of 42.

    Returns
    -------
    tuple
        ``(tf_idf_train, tf_idf_test, y_train, y_test, vectorizer)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)
    # Fit on the training documents only; the test split is merely transformed.
    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer

In [21]:
# Vectorize the headlines and split into train/test sets once, up front, so
# the models fitted later reuse the same matrices and labels.
# NOTE(review): this feeds the raw 'headline' column, not the cleaned
# 'lower_desc' column built earlier -- confirm that is intended.
idf_train, idf_test, y_tr, y_t, vectorizer = tfidf(
    X=X['headline'], y=y, stopwords_list=stopwords_list
)

In [22]:
# Inspect only the first few rows. Densifying the whole sparse TF-IDF matrix
# allocates one cell per (document, term) pair -- for the 150639 x 49407 shape
# this notebook reports, about 7.4 billion cells (~60 GB as float64).
idf_train[:5].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Prediction function

In [23]:
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    """Fit ``classifier`` on the training data and predict both splits.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    tf_idf_train, tf_idf_test
        Feature matrices for the training and test splits.
    y_train
        Labels for ``tf_idf_train``.

    Returns
    -------
    tuple
        ``(train_preds, test_preds)`` as returned by ``classifier.predict``.
    """
    classifier.fit(tf_idf_train, y_train)
    # Tuple elements evaluate left to right: training predictions first, then test.
    return classifier.predict(tf_idf_train), classifier.predict(tf_idf_test)

### Scoring function to score predictions

In [24]:
def score_preds(y_test,y_train,test_preds, train_preds):
    """Print the accuracy of the train and test predictions.

    Note the argument order: test labels, train labels, test predictions,
    train predictions. The training score is printed first, then the test
    score. Nothing is returned.
    """
    # Confusion matrices are intentionally not computed here.
    splits = (
        ("Train: ", y_train, train_preds),
        ("Test: ", y_test, test_preds),
    )
    for label, truth, predicted in splits:
        print(label, accuracy_score(truth, predicted))

### Make baseline RF and NB models

In [25]:
# Baseline models. The forest is seeded so its scores are reproducible across
# runs, and n_jobs=-1 builds the trees on all available cores; neither setting
# changes the number of trees or any other hyper-parameter.
rfc = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
nb_classifier = MultinomialNB()

In [26]:
# Dimensions of the training TF-IDF matrix: (documents, vocabulary terms).
idf_train.shape

(150639, 49407)

In [27]:
# Number of held-out test labels.
y_t.shape

(50214,)

In [28]:
# Fit the Naive Bayes baseline on the TF-IDF training matrix and collect its
# predictions for both splits.
nb_train_preds, nb_test_preds = classify_text(
    classifier=nb_classifier,
    tf_idf_train=idf_train,
    tf_idf_test=idf_test,
    y_train=y_tr,
)

In [29]:
# Report train/test accuracy for the Naive Bayes baseline. Keyword arguments
# make the (test, train, test, train) parameter order explicit.
score_preds(
    y_test=y_t,
    y_train=y_tr,
    test_preds=nb_test_preds,
    train_preds=nb_train_preds,
)

Train:  0.4587855734570729
Test:  0.41131955231608713


In [None]:
# Fit the random-forest baseline on the TF-IDF training matrix and collect its
# predictions for both splits.
rf_train_preds, rf_test_preds = classify_text(
    classifier=rfc,
    tf_idf_train=idf_train,
    tf_idf_test=idf_test,
    y_train=y_tr,
)

In [None]:
# Report train/test accuracy for the random-forest baseline. Keyword arguments
# make the (test, train, test, train) parameter order explicit.
score_preds(
    y_test=y_t,
    y_train=y_tr,
    test_preds=rf_test_preds,
    train_preds=rf_train_preds,
)