In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/willcline/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
path = "../../data/News_Category_Dataset_v2.json"

In [3]:
df = pd.read_json(path,lines=True)

In [4]:
df.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26


In [50]:
df['category'] = df['category'].replace(['CULTURE & ARTS'],'ARTS & CULTURE')

In [5]:
df.shape

(200853, 6)

In [6]:
df.drop(['link', 'date'], axis=1, inplace=True)

In [7]:
df.head(2)

Unnamed: 0,category,headline,authors,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.


In [8]:
value_counts = round(df.category.value_counts(normalize=True)*100,2)

In [9]:
value_counts

POLITICS          16.30
WELLNESS           8.88
ENTERTAINMENT      7.99
TRAVEL             4.92
STYLE & BEAUTY     4.80
PARENTING          4.32
HEALTHY LIVING     3.33
QUEER VOICES       3.14
FOOD & DRINK       3.10
BUSINESS           2.96
COMEDY             2.58
SPORTS             2.43
BLACK VOICES       2.25
HOME & LIVING      2.09
PARENTS            1.97
THE WORLDPOST      1.82
WEDDINGS           1.82
WOMEN              1.74
IMPACT             1.72
DIVORCE            1.71
CRIME              1.70
MEDIA              1.40
WEIRD NEWS         1.33
GREEN              1.31
WORLDPOST          1.28
RELIGION           1.27
STYLE              1.12
SCIENCE            1.08
WORLD NEWS         1.08
TASTE              1.04
TECH               1.04
MONEY              0.85
ARTS               0.75
FIFTY              0.70
GOOD NEWS          0.70
ARTS & CULTURE     0.67
ENVIRONMENT        0.66
COLLEGE            0.57
LATINO VOICES      0.56
CULTURE & ARTS     0.51
EDUCATION          0.50
Name: category, 

## By categories

In [67]:
#gets cat df, X, and y
def get_cat(cat, df):
    cat_df = df[df['category']==cat]
    cat_X = df.drop('category', axis=1)
    cat_y = df.category
    return cat_df, cat_X, cat_y

In [68]:
crime_df, crime_X, crime_y = get_cat('CRIME', df)

In [80]:
def cat_eda(df):
    category_list = df['category'].value_counts().index.tolist()
    print(category_list)
    for category in category_list:
        cat_df, cat_X, cat_y = get_cat(category, df)
        
    return cat_df

In [81]:
cat_eda(df)

['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING', 'PARENTS', 'THE WORLDPOST', 'WEDDINGS', 'WOMEN', 'IMPACT', 'DIVORCE', 'CRIME', 'MEDIA', 'WEIRD NEWS', 'GREEN', 'WORLDPOST', 'RELIGION', 'ARTS & CULTURE', 'STYLE', 'SCIENCE', 'WORLD NEWS', 'TASTE', 'TECH', 'MONEY', 'ARTS', 'FIFTY', 'GOOD NEWS', 'ENVIRONMENT', 'COLLEGE', 'LATINO VOICES', 'EDUCATION']
0                 CRIME
1         ENTERTAINMENT
2         ENTERTAINMENT
3         ENTERTAINMENT
4         ENTERTAINMENT
              ...      
200848             TECH
200849           SPORTS
200850           SPORTS
200851           SPORTS
200852           SPORTS
Name: category, Length: 200853, dtype: object
0                 CRIME
1         ENTERTAINMENT
2         ENTERTAINMENT
3         ENTERTAINMENT
4         ENTERTAINMENT
              ...      
200848             TECH
200849           SPORTS


### Make Target and features DFs

In [10]:
y = df.category

In [11]:
X = df.drop('category', axis=1)

### Get list of stop words

In [12]:
# generate a list of stopwords for TfidfVectorizer to ignore
stopwords_list = stopwords.words('english') + list(string.punctuation)

## EDA

### Basic cleaning

In [36]:
#lower case
df['lower_desc'] = df['headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [37]:
#remove punctuation
df['lower_desc'] = df['lower_desc'].str.replace('[^\w\s]','')

  df['lower_desc'] = df['lower_desc'].str.replace('[^\w\s]','')


In [39]:
#stopword gathering and removal
stop = stopwords.words('english')
df['lower_desc'] = df['lower_desc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

### Tokenize

In [40]:
desc_str = ' '.join(df['lower_desc'].tolist())
print(desc_str)
tokens = nltk.word_tokenize(desc_str) #tokenizing 
print(len(tokens))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



1349444


### Pos tagging

In [None]:
nltk.download('averaged_perceptron_tagger')

In [41]:
tokens_pos = nltk.pos_tag(tokens)
pos_df = pd.DataFrame(tokens_pos, columns = ('word','POS'))
pos_sum = pos_df.groupby('POS', as_index=False).count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/willcline/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,POS,word
12,NN,491510
8,JJ,245190
15,NNS,199835
30,VBP,99660
28,VBG,54318
31,VBZ,45985
3,CD,43848
19,RB,43363
27,VBD,37770
26,VB,22330


In [42]:
#getting just the nouns
filtered_pos = [ ]
for one in tokens_pos:
    if one[1] == 'NN' or one[1] == 'NNS' or one[1] == 'NNP' or one[1] == 'NNPS':
        filtered_pos.append(one)
print (len(filtered_pos))

693120


In [43]:
#the 100 most common nouns
fdist_pos = nltk.FreqDist(filtered_pos)
top_100_words = fdist_pos.most_common(100)
print(top_100_words)

[(('photos', 'NNS'), 7056), (('trump', 'NN'), 6351), (('day', 'NN'), 4417), (('video', 'NN'), 4220), (('women', 'NNS'), 2845), (('people', 'NNS'), 2785), (('life', 'NN'), 2742), (('kids', 'NNS'), 2661), (('world', 'NN'), 2626), (('time', 'NN'), 2454), (('week', 'NN'), 2376), (('ways', 'NNS'), 2329), (('health', 'NN'), 2326), (('trumps', 'NNS'), 2241), (('things', 'NNS'), 2156), (('house', 'NN'), 2094), (('man', 'NN'), 2044), (('years', 'NNS'), 1945), (('donald', 'NN'), 1928), (('home', 'NN'), 1818), (('way', 'NN'), 1734), (('clinton', 'NN'), 1713), (('bill', 'NN'), 1676), (('food', 'NN'), 1618), (('photo', 'NN'), 1603), (('year', 'NN'), 1533), (('woman', 'NN'), 1485), (('family', 'NN'), 1465), (('tips', 'NNS'), 1426), (('school', 'NN'), 1408), (('president', 'NN'), 1398), (('fashion', 'NN'), 1356), (('news', 'NN'), 1314), (('style', 'NN'), 1290), (('death', 'NN'), 1283), (('state', 'NN'), 1281), (('parents', 'NNS'), 1267), (('marriage', 'NN'), 1211), (('york', 'NN'), 1210), (('study', 

In [44]:
top_words_df = pd.DataFrame(top_100_words, columns = ('pos','count'))
top_words_df['Word'] = top_words_df['pos'].apply(lambda x: x[0]) # split the tuple of POS
top_words_df = top_words_df.drop('pos', 1) # drop the previous column
top_words_df.head(10)

  top_words_df = top_words_df.drop('pos', 1) # drop the previous column


Unnamed: 0,count,Word
0,7056,photos
1,6351,trump
2,4417,day
3,4220,video
4,2845,women
5,2785,people
6,2742,life
7,2661,kids
8,2626,world
9,2454,time


In [None]:
fig, ax = plt.subplots(figsize=(15,18))
top_words_df.sort_values(by='count').plot.barh(x='Word',
                      y='count',
                      ax=ax,
                      color="purple")

ax.set_title("Common Words Found in DS Job Descriptions(Without Stop Words)")

plt.show()

### Vectorize data with tf-idf(one of the 3 vectorizer methods) and make a train test split

In [13]:
# generate tf-idf vectorization (use sklearn's TfidfVectorizer) for our data
def tfidf(X, y,  stopwords_list): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)
    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer

In [14]:
#Call above function to vectorize and train test split data and put them in variables.
#Up here because presumably only have to do it once
idf_train, idf_test, y_tr, y_t, vectorizer = tfidf(X['headline'], y, stopwords_list)

In [15]:
idf_train.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Prediction function

In [16]:
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    classifier.fit(tf_idf_train, y_train)
    train_preds = classifier.predict(tf_idf_train)
    test_preds = classifier.predict(tf_idf_test)
    return train_preds, test_preds

### Scoring function to score predictions

In [31]:
def score_preds(y_test,y_train,test_preds, train_preds):
    print("Train: ", accuracy_score(y_train, train_preds))
    print("Test: ", accuracy_score(y_test, test_preds))
#     confusion_matrix(y_test, test_preds)
#     confusion_matrix(y_train, train_preds)

### Make baseline RF and NB models

In [18]:
rfc = RandomForestClassifier(n_estimators=100)
nb_classifier = MultinomialNB()

In [19]:
idf_train.shape

(150639, 49407)

In [23]:
y_t.shape

(50214,)

In [24]:
#Call function using results of vectorize function using NB model
nb_train_preds, nb_test_preds = classify_text(nb_classifier, idf_train, idf_test, y_tr)

In [32]:
#Call function for score
score_preds(y_t, y_tr, nb_test_preds, nb_train_preds)

Train:  0.45857314506867414
Test:  0.41116023419763414


In [25]:
#Call function using results of vectorize function using RF model
rf_train_preds, rf_test_preds = classify_text(rfc, idf_train, idf_test, y_tr)

In [33]:
#Call function for score
score_preds(y_t, y_tr, rf_test_preds, rf_train_preds)

Train:  0.9978425241803252
Test:  0.5324411518699964
