In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/willcline/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
path = "../../data/News_Category_Dataset_v2.json"

In [3]:
df = pd.read_json(path,lines=True)

In [4]:
df.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26


In [99]:
df['all_words'] = df['headline'] + "; " + df['short_description']

In [102]:
df['category'] = df['category'].replace(['CULTURE & ARTS'],'ARTS & CULTURE')

In [103]:
df.shape

(200853, 6)

In [104]:
df.drop(['link', 'date'], axis=1, inplace=True)

KeyError: "['link' 'date'] not found in axis"

In [None]:
df.head(2)

In [None]:
value_counts = round(df.category.value_counts(normalize=True)*100,2)

In [None]:
value_counts

## By categories

In [None]:
#gets cat df, X, and y
def get_cat(cat, df):
    cat_df = df[df['category']==cat]
    cat_X = df.drop('category', axis=1)
    cat_y = df.category
    return cat_df, cat_X, cat_y

In [None]:
crime_df, crime_X, crime_y = get_cat('CRIME', df)

In [105]:
def cat_eda(df):
    category_list = df['category'].value_counts().index.tolist()
    print(category_list)
    for category in category_list:
        cat_df, cat_X, cat_y = get_cat(category, df)
        print(cat_df)
        

In [106]:
cat_eda(df)

['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING', 'PARENTS', 'THE WORLDPOST', 'WEDDINGS', 'WOMEN', 'IMPACT', 'DIVORCE', 'CRIME', 'MEDIA', 'WEIRD NEWS', 'GREEN', 'WORLDPOST', 'RELIGION', 'ARTS & CULTURE', 'STYLE', 'SCIENCE', 'WORLD NEWS', 'TASTE', 'TECH', 'MONEY', 'ARTS', 'FIFTY', 'GOOD NEWS', 'ENVIRONMENT', 'COLLEGE', 'LATINO VOICES', 'EDUCATION']
        category                                           headline  \
13      POLITICS  Trump's Crackdown On Immigrant Parents Puts Mo...   
14      POLITICS  'Trump's Son Should Be Concerned': FBI Obtaine...   
15      POLITICS  Edward Snowden: There's No One Trump Loves Mor...   
16      POLITICS  Booyah: Obama Photographer Hilariously Trolls ...   
17      POLITICS  Ireland Votes To Repeal Abortion Amendment In ...   
...          ...                                                ...   
12497

### Make Target and features DFs

In [107]:
y = df.category

In [108]:
X = df.drop('category', axis=1)

### Get list of stop words

In [109]:
# generate a list of stopwords for TfidfVectorizer to ignore
stopwords_list = stopwords.words('english') + list(string.punctuation)

## EDA

### Basic cleaning

In [110]:
#lower case
df['lower_desc'] = df['headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [111]:
#remove punctuation
df['lower_desc'] = df['lower_desc'].str.replace('[^\w\s]','')

  df['lower_desc'] = df['lower_desc'].str.replace('[^\w\s]','')


In [112]:
#stopword gathering and removal
stop = stopwords.words('english')
df['lower_desc'] = df['lower_desc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

### Tokenize

In [113]:
desc_str = ' '.join(df['lower_desc'].tolist())
print(desc_str)
tokens = nltk.word_tokenize(desc_str) #tokenizing 
print(len(tokens))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



1349444


### Pos tagging

In [114]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/willcline/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
tokens_pos = nltk.pos_tag(tokens)
pos_df = pd.DataFrame(tokens_pos, columns = ('word','POS'))
pos_sum = pos_df.groupby('POS', as_index=False).count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

In [None]:
#getting just the nouns
filtered_pos = [ ]
for one in tokens_pos:
    if one[1] == 'NN' or one[1] == 'NNS' or one[1] == 'NNP' or one[1] == 'NNPS':
        filtered_pos.append(one)
print (len(filtered_pos))

In [None]:
#the 100 most common nouns
fdist_pos = nltk.FreqDist(filtered_pos)
top_100_words = fdist_pos.most_common(100)
print(top_100_words)

In [None]:
top_words_df = pd.DataFrame(top_100_words, columns = ('pos','count'))
top_words_df['Word'] = top_words_df['pos'].apply(lambda x: x[0]) # split the tuple of POS
top_words_df = top_words_df.drop('pos', 1) # drop the previous column
top_words_df.head(10)

In [None]:
fig, ax = plt.subplots(figsize=(15,18))
top_words_df.sort_values(by='count').plot.barh(x='Word',
                      y='count',
                      ax=ax,
                      color="purple")

ax.set_title("Common Words Found in DS Job Descriptions(Without Stop Words)")

plt.show()

### Vectorize data with tf-idf(one of the 3 vectorizer methods) and make a train test split

In [None]:
# generate tf-idf vectorization (use sklearn's TfidfVectorizer) for our data
def tfidf(X, y,  stopwords_list): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)
    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer

In [None]:
#Call above function to vectorize and train test split data and put them in variables.
#Up here because presumably only have to do it once
idf_train, idf_test, y_tr, y_t, vectorizer = tfidf(X['headline'], y, stopwords_list)

In [None]:
idf_train.todense()

### Prediction function

In [None]:
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    classifier.fit(tf_idf_train, y_train)
    train_preds = classifier.predict(tf_idf_train)
    test_preds = classifier.predict(tf_idf_test)
    return train_preds, test_preds

### Scoring function to score predictions

In [None]:
def score_preds(y_test,y_train,test_preds, train_preds):
    print("Train: ", accuracy_score(y_train, train_preds))
    print("Test: ", accuracy_score(y_test, test_preds))
#     confusion_matrix(y_test, test_preds)
#     confusion_matrix(y_train, train_preds)

### Make baseline RF and NB models

In [None]:
rfc = RandomForestClassifier(n_estimators=100)
nb_classifier = MultinomialNB()

In [None]:
idf_train.shape

In [None]:
y_t.shape

In [None]:
#Call function using results of vectorize function using NB model
nb_train_preds, nb_test_preds = classify_text(nb_classifier, idf_train, idf_test, y_tr)

In [None]:
#Call function for score
score_preds(y_t, y_tr, nb_test_preds, nb_train_preds)

In [None]:
#Call function using results of vectorize function using RF model
rf_train_preds, rf_test_preds = classify_text(rfc, idf_train, idf_test, y_tr)

In [None]:
#Call function for score
score_preds(y_t, y_tr, rf_test_preds, rf_train_preds)