In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import datetime as dt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re
import pickle
from scipy.sparse import hstack


from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, make_scorer

import os
print(os.listdir("../input/"))

# set seed for reproducibility
np.random.seed(0)

['__output__.json', 'newsdata.csv']


In [3]:
# Load Data and also parse the dates
print("Loading data...")
train = pd.read_csv('../input/newsdata.csv', parse_dates=["date"])
print("Train shape:", train.shape)

Loading data...
Train shape: (124989, 6)


In [4]:
# check data types of the object

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124989 entries, 0 to 124988
Data columns (total 6 columns):
authors              110838 non-null object
category             124989 non-null object
date                 124989 non-null datetime64[ns]
headline             124983 non-null object
link                 124989 non-null object
short_description    105399 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 5.7+ MB


In [5]:
train.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


In [6]:
train['category'].value_counts()

POLITICS          32739
ENTERTAINMENT     14257
HEALTHY LIVING     6694
QUEER VOICES       4995
BUSINESS           4254
SPORTS             4167
COMEDY             3971
PARENTS            3955
BLACK VOICES       3858
THE WORLDPOST      3664
WOMEN              3490
CRIME              2893
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
IMPACT             2602
WORLDPOST          2579
RELIGION           2556
STYLE              2254
WORLD NEWS         2177
TRAVEL             2145
TASTE              2096
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
SCIENCE            1381
ARTS & CULTURE     1339
TECH               1231
COLLEGE            1144
LATINO VOICES      1129
EDUCATION          1004
Name: category, dtype: int64

In [33]:
%env JOBLIB_TEMP_FOLDER=/tmp
# Check the first headline

print('The first headline is:\n\n',train["headline"][0])
# Check the first short description
print('The first description is:\n\n',train["short_description"][0])

env: JOBLIB_TEMP_FOLDER=/tmp
The first headline is:

 2 mass shoot texa last week 1 tv
The first description is:

 left husband kill children anoth day america


## Feature Engineering

In [8]:
# extracting datetime features using datetime module
# Value of year
train["Year"] = train["date"].dt.year

# Month number (values between 1 to 12) 
train["Month"] = train["date"].dt.month

# Week Day value (values between 1 to 7)
train['Weekday'] = train['date'].dt.weekday

# Value of time hour (values between 0 to 23)
train["Hour"] = train['date'].dt.hour

# Year Day (values between 1 to 365)
train["Month_Day"] = train['date'].dt.day

# Month Day (values between 1 to 31)
train["Year_Day"] = train['date'].dt.dayofyear

In [9]:
train.head(2)

Unnamed: 0,authors,category,date,headline,link,short_description,Year,Month,Weekday,Hour,Month_Day,Year_Day
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018,5,5,0,26,146
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018,5,5,0,26,146


In [10]:
# Drop date column
col = ['date','link']
train = train.drop(col,axis = 1)

In [11]:
train.head(2)

Unnamed: 0,authors,category,headline,short_description,Year,Month,Weekday,Hour,Month_Day,Year_Day
0,Melissa Jeltsen,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...,2018,5,5,0,26,146
1,Andy McDonald,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.,2018,5,5,0,26,146


In [12]:
missing_df = train.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df[missing_df['missing_count']>0]
missing_df = missing_df.sort_values(by='missing_count')
missing_df

Unnamed: 0,column_name,missing_count
2,headline,6
0,authors,14151
3,short_description,19590


Let's get rid of them. 

In [13]:
train = train.drop_duplicates() # Drop duplicates rows

In [14]:
train = train.dropna() # Drop rows containing Nan values

In [15]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [16]:
# clean description
train['headline'] = train['headline'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))
train['short_description'] = train['short_description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [17]:
# label encode categorical features in data given
cols = ['authors','category']

for x in cols:
    lbl = LabelEncoder()
    train[x] = lbl.fit_transform(train[x])

In [18]:
train['authors'].max()

18356

In [19]:
train['category'].max()

30

In [20]:
y = train['category']

In [21]:
features_df = train[['authors', 'Year','Month', 'Weekday', 'Hour', 'Month_Day', 'Year_Day']]

In [22]:
scorer = make_scorer(accuracy_score, greater_is_better=True)

In [23]:
import warnings 
from sklearn.exceptions import ConvergenceWarning

# Filter out warnings from models
warnings.filterwarnings('ignore', category = ConvergenceWarning)
warnings.filterwarnings('ignore', category = DeprecationWarning)
warnings.filterwarnings('ignore', category = UserWarning)

# Dataframe to hold results
model_results = pd.DataFrame(columns = ['model', 'cv_mean', 'cv_std'])

def cv_model(train, train_labels, model, name, model_results=None):
    """Perform 3 fold cross validation of a model"""
    cv_scores = cross_val_score(model, train, train_labels, cv = 3, scoring=scorer, n_jobs = -1)
    print(f'3 Fold CV Score: {round(cv_scores.mean(), 5)} with std: {round(cv_scores.std(), 5)}')
    
    if model_results is not None:
        model_results = model_results.append(pd.DataFrame({'model': name, 
                                                           'cv_mean': cv_scores.mean(), 
                                                            'cv_std': cv_scores.std()},
                                                           index = [0]),
                                             ignore_index = True)

        return model_results

In [24]:
# Bag of Words (word based) for headline
ctv_word0 = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df = 200, max_features=5000,
            ngram_range=(1,2), stop_words = 'english')

# Fitting CountVectorizer to training sets
ctv_word0.fit(list(train['headline']))
ctv_word_headline =  ctv_word0.transform(train['headline'])

In [25]:
# Bag of Words (word based) for short description
ctv_word1 = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df = 200, max_features=10000,
            ngram_range=(2,6), stop_words = 'english')

# Fitting CountVectorizer to training sets
ctv_word1.fit(list(train['short_description']))
ctv_word_desc =  ctv_word1.transform(train['short_description'])

In [26]:
# TF - IDF (words) for headline

tfv_word0 = TfidfVectorizer(min_df=150,  max_features= 5000, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,2),
            stop_words = 'english')

# Fitting TF-IDF to training set
tfv_word0.fit(list(train['headline']))
tfv_word_headline =  tfv_word0.transform(train['headline'])

In [27]:
# TF - IDF (words) for short description

tfv_word1 = TfidfVectorizer(min_df=150,  max_features= 5000, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,2),
            stop_words = 'english')

# Fitting TF-IDF to training sets
tfv_word1.fit(list(train['short_description']))
tfv_word_desc =  tfv_word1.transform(train['short_description'])

In [28]:
# bag of words for training set (headline + description)
train_bow = hstack([ctv_word_headline, ctv_word_desc])
# create dataframe for features
bow_df = pd.DataFrame(train_bow.todense())

In [29]:
# TF - IDF for training set (headline + description)
train_tfidf = hstack([tfv_word_desc,tfv_word_headline])
# create dataframe for features
tfidf_df = pd.DataFrame(train_tfidf.todense())

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score,GridSearchCV

# Model imports
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

import xgboost as xgb

In [31]:
model_results = cv_model(features_df, y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.01975 with std: 0.0024


In [34]:
model_results = cv_model(ctv_word_headline.toarray(), y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.42638 with std: 0.02122


In [35]:
model_results = cv_model(ctv_word_desc.toarray(), y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.27851 with std: 0.00021


In [36]:
model_results = cv_model(bow_df, y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.42696 with std: 0.02314


In [37]:
model_results = cv_model(tfv_word_desc.toarray(), y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.36787 with std: 0.01543


In [38]:
model_results = cv_model(tfv_word_desc.toarray(), y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.36787 with std: 0.01543


In [39]:
model_results = cv_model(bow_df, y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.42696 with std: 0.02314


In [40]:
model_results = cv_model(tfidf_df, y, 
                         MultinomialNB(), 'GNB', model_results)

3 Fold CV Score: 0.50269 with std: 0.01086


Yeah, so we achieved **50% accuracy with TF - IDF  combined on description and headline text features**. 