# Clustering News
### This dataset has been downloaded from https://www.kaggle.com/rmisra/news-category-dataset.
### Our goal is to categorize news articles based on their headlines and short descriptions

In [1]:
# Importing libs

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# K-Means
from sklearn.cluster import KMeans
#confusion Matrix
from sklearn.metrics import confusion_matrix
# Score
from sklearn.metrics import classification_report

In [2]:
#Load the dataset
news = pd.read_csv('/media/sf_FormacaoCientistaDeDados/Portfolio/News_Category/news-category-dataset/news_dataset.csv')

In [3]:
# top 5 rows
news.head()

Unnamed: 0.1,Unnamed: 0,authors,category,date,headline,link,short_description,text
0,0,,WORLD NEWS,2018-05-26,South Korean President Meets North Korea's Kim...,https://www.huffingtonpost.com/entry/south-kor...,The two met to pave the way for a summit betwe...,South Korean President Meets North Korea's Kim...
1,1,David Moye,WEIRD NEWS,2018-05-26,Weird Father's Day Gifts Your Dad Doesn't Know...,https://www.huffingtonpost.com/entry/weird-fat...,Why buy a boring tie when you can give him tes...,Weird Father's Day Gifts Your Dad Doesn't Know...
2,2,Hilary Hanson,WEIRD NEWS,2018-05-26,Mystery 'Wolf-Like' Animal Reportedly Shot In ...,https://www.huffingtonpost.com/entry/montana-w...,“We have no idea what this was until we get a ...,Mystery 'Wolf-Like' Animal Reportedly Shot In ...
3,3,"Josh Smith and Christine Kim, Reuters",WORLD NEWS,2018-05-25,North Korea Still Open To Talks After Trump Ca...,https://www.huffingtonpost.com/entry/north-kor...,Trump’s announcement came after repeated threa...,North Korea Still Open To Talks After Trump Ca...
4,4,,WORLD NEWS,2018-05-25,2 Men Detonate Bomb Inside Indian Restaurant N...,https://www.huffingtonpost.com/entry/mississau...,"Fifteen people were taken to the hospital, thr...",2 Men Detonate Bomb Inside Indian Restaurant N...


### For this model, we've using only "category" and "text" columns. 

In [4]:
#Drop columns
news.drop(columns=['authors','date','headline','link','short_description','Unnamed: 0'],inplace=True)

In [5]:
#Checking missing values, columns and rowcount
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21450 entries, 0 to 21449
Data columns (total 2 columns):
category    21450 non-null object
text        21450 non-null object
dtypes: object(2)
memory usage: 335.2+ KB


In [6]:
#grouping text by category
news.groupby(['category']).count()

Unnamed: 0_level_0,text
category,Unnamed: 1_level_1
GREEN,2622
MEDIA,2815
RELIGION,2556
SCIENCE,2178
STYLE,2254
TASTE,2096
TECH,2082
WEIRD NEWS,2670
WORLD NEWS,2177


In [7]:
# shape of the dataset
news.shape

(21450, 2)

## Preprocessing
#### We'll to turn "text" feature into word vectors

In [8]:
# Function for preprocessing
def preprocessing(text):
    #removing the punctuation
    text_v2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    #tokenizes sentences into words
    tokens = [word for sent in nltk.sent_tokenize(text_v2) for word in nltk.word_tokenize(sent)]
    #lowercase
    tokens =[word.lower() for word in tokens]
    #Removing stopwords
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    #Removing words lenght <  3
    tokens = [token for token in tokens if len(tokens) >=3 ]
    #Stemming ("eating" into "eat")
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    #POS tagging ( tags to words , ex running is verb)
    tagged_corpus = pos_tag(tokens)
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    #lemmatizer (brings down word to root word, ex ate into eat, but considering tagging)
    #Source: https://en.wikipedia.org/wiki/Lemmatisation
    lemmatizer = WordNetLemmatizer()
    
    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    
    pre_proc_text = ' '.join([prat_lemmatize(token,tag)for token ,tag in tagged_corpus])
    
    return pre_proc_text


# References: Statistics for Machine Learning,2017, p 212 ,author Pratap Dangeti

    

In [9]:
# Applies the preprocessing function
news_normalized = []
for r in news['text']:
    news_normalized.append(preprocessing(r))
    

In [10]:
#split train and test dataset
x_train, x_test, y_train, y_test = train_test_split(news_normalized, news['category'])

In [11]:
#building TFIDF Vectorizer
# min_df = ignore terms lower than the given threshould 
# strip_accents = remove accents and perform other normalization
# ngram_range = sets of consecutive words ex
vectorizer = TfidfVectorizer(min_df=3 ,stop_words ='english',strip_accents='unicode',ngram_range=(1,2))#,norm='l1')
x_train_2 = vectorizer.fit_transform(x_train).todense()

## Naive Bayes

In [12]:
# Training the model
nb = MultinomialNB().fit(x_train_2,y_train)
#Predict
nb_predicted = nb.predict(x_train_2)


In [13]:
# Evaluate test dataset 
x_test_2 = vectorizer.transform(x_test).todense() 
nb_predicted_test = nb.predict(x_test_2)

In [14]:
#confusion_matrix 
pd.crosstab(y_test, nb_predicted_test,rownames =["Actuall"],colnames = ["Predicted"])

Predicted,GREEN,MEDIA,RELIGION,SCIENCE,STYLE,TASTE,TECH,WEIRD NEWS,WORLD NEWS
Actuall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GREEN,539,15,7,20,5,15,2,71,9
MEDIA,14,627,22,5,2,2,24,18,14
RELIGION,20,28,530,10,4,6,4,15,14
SCIENCE,65,14,25,370,11,8,10,29,0
STYLE,15,27,13,8,443,16,8,41,3
TASTE,15,12,9,7,15,416,9,27,1
TECH,16,39,11,12,11,4,371,28,7
WEIRD NEWS,42,37,19,22,16,28,13,441,6
WORLD NEWS,54,32,56,5,2,0,10,19,403


In [15]:
# score on test dataset
print(classification_report(y_test, nb_predicted_test))

             precision    recall  f1-score   support

      GREEN       0.69      0.79      0.74       683
      MEDIA       0.75      0.86      0.80       728
   RELIGION       0.77      0.84      0.80       631
    SCIENCE       0.81      0.70      0.75       532
      STYLE       0.87      0.77      0.82       574
      TASTE       0.84      0.81      0.83       511
       TECH       0.82      0.74      0.78       499
 WEIRD NEWS       0.64      0.71      0.67       624
 WORLD NEWS       0.88      0.69      0.78       581

avg / total       0.78      0.77      0.77      5363



## K-Means

In [16]:
# K-means
km = KMeans(n_clusters = 9,init= 'random',n_init=1,verbose = 1)
km.fit(x_train_2)

Initialization complete
start iteration
done sorting
end inner loop


MemoryError: 

In [29]:
x_train_2

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
km.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [43]:
km.labels_.shape

(7243,)

In [44]:
# Centroids
km.cluster_centers_

array([[  1.10199847e-04,   2.55529676e-03,   1.67927139e-04, ...,
          2.41521866e-04,   9.00642152e-04,   5.82826854e-05],
       [ -5.14996032e-19,   1.64798730e-17,   1.62630326e-19, ...,
         -5.96311195e-19,   1.57353824e-03,  -8.13151629e-19],
       [  2.98155597e-19,  -2.16840434e-18,   0.00000000e+00, ...,
         -4.33680869e-19,  -8.67361738e-19,   1.35525272e-19],
       [  2.39945655e-04,   4.27605676e-03,   7.42743450e-04, ...,
          9.67587611e-04,   3.03043404e-04,   5.94420561e-04]])

In [45]:
# Confusion matrix
print(pd.crosstab(y_train, km.labels_,rownames = ["Actuall"],colnames = ["Predicted"]) )

Predicted      0    1   2    3
Actuall                       
RELIGION    1650   38  99  115
SCIENCE      801   56   0  759
STYLE       1341  235   0  127
WEIRD NEWS  1830   69   0  123
