In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import contractions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

In [2]:
df = pd.read_csv('news-data.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
# Collect the distinct category labels and report how many there are.
categories = set(df['category'])
print(f'Categories: {categories}')
print(f'No. of Categories: {len(categories)}')

Categories: {'entertainment', 'business', 'sport', 'tech', 'politics'}
No. of Categories: 5


In [4]:
df.shape

(2225, 2)

In [5]:
df.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [6]:
# Integer class label for each category name; looked up by convert_category_to_num
# when the 'category' column is converted to numbers.
cat_map = {'sport':0, 'entertainment':1, 'business':2, 'politics':3, 'tech':4}

### Word count

In [7]:
# Number of whitespace-separated tokens in each article.
df['word_counts'] = df['text'].map(lambda text: len(str(text).split()))

In [8]:
df.head()

Unnamed: 0,category,text,word_counts
0,tech,tv future in the hands of viewers with home th...,737
1,business,worldcom boss left books alone former worldc...,300
2,sport,tigers wary of farrell gamble leicester say ...,246
3,sport,yeading face newcastle in fa cup premiership s...,341
4,entertainment,ocean s twelve raids box office ocean s twelve...,260


### Character count

In [9]:
# Total number of characters (whitespace included) in each article.
df['char_counts'] = df['text'].map(len)
df.head()

Unnamed: 0,category,text,word_counts,char_counts
0,tech,tv future in the hands of viewers with home th...,737,4333
1,business,worldcom boss left books alone former worldc...,300,1842
2,sport,tigers wary of farrell gamble leicester say ...,246,1342
3,sport,yeading face newcastle in fa cup premiership s...,341,2176
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579


### Average word length

In [10]:
def get_avg_word_len(x):
    """Return the mean length of the whitespace-separated words in ``x``.

    Whitespace is excluded from the character total, so the result is
    (number of non-whitespace characters) / (number of words).

    Parameters
    ----------
    x : str
        Text to measure.

    Returns
    -------
    float
        Average word length, or ``0.0`` when ``x`` contains no words
        (empty or whitespace-only text), instead of dividing by zero.
    """
    words = x.split()
    if not words:
        # Guard: an empty token list would otherwise raise ZeroDivisionError.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [11]:
# Mean token length per article; the helper is passed directly, no wrapper lambda needed.
df['avg_word_len'] = df['text'].map(get_avg_word_len)
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077


### Stop words count

In [12]:
# Count the tokens of each article that appear in spaCy's English stop-word set.
df['stop_words_len'] = df['text'].map(
    lambda text: sum(1 for tok in text.split() if tok in STOP_WORDS)
)
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93


### Count of numeric tokens

In [13]:
# Count the tokens of each article that consist solely of digits.
df['numerics_count'] = df['text'].map(
    lambda text: sum(1 for tok in text.split() if tok.isdigit())
)
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363,3
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122,3
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139,0
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93,1


## Preprocessing and Cleaning

### Lower case conversion

In [14]:
# Normalise case so later token comparisons are case-insensitive.
df['text'] = df['text'].map(str.lower)
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363,3
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122,3
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139,0
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93,1


### Contraction to Expansion

In [15]:
#Contraction Example
x = "i don't know what you want, can't, he'll, i'd"

In [16]:
contractions.fix(x)

'i do not know what you want, can not, he will, I would'

In [17]:
%%time
# Expand contractions ("don't" -> "do not") in every article.
df['text'] = df['text'].map(contractions.fix)

CPU times: user 491 ms, sys: 0 ns, total: 491 ms
Wall time: 491 ms


In [18]:
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363,3
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122,3
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139,0
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93,1


### Remove multiple spaces 

In [19]:
# Collapse every run of whitespace into a single space and trim both ends.
# (This cell previously repeated the special-character regex, so runs of
# spaces were never actually collapsed.)
df['text'] = df.text.apply(lambda x: re.sub(r'\s+', ' ', x).strip())
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363,3
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122,3
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139,0
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93,1


### Special character removal or punctuation removal

In [20]:
# Delete every character that is not an ASCII letter, a digit, a space or a hyphen.
_special_chars = re.compile('[^A-Z a-z 0-9-]+')
df['text'] = df['text'].map(lambda doc: _special_chars.sub('', doc))
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future in the hands of viewers with home th...,737,4333,4.786974,363,3
1,business,worldcom boss left books alone former worldc...,300,1842,5.036667,122,3
2,sport,tigers wary of farrell gamble leicester say ...,246,1342,4.361789,139,0
3,sport,yeading face newcastle in fa cup premiership s...,341,2176,5.240469,82,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,260,1579,4.973077,93,1


### Remove HTML tags

In [21]:
from bs4 import BeautifulSoup

In [22]:
%%time
# Parse each article as HTML and keep only its text content.
# NOTE(review): the special-character regex applied earlier already deleted
# '<', '>' and '/', so there are probably no intact tags left to strip at this
# point — consider running this step before the regex clean-up.
df['text'] = df.text.apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

CPU times: user 612 ms, sys: 14.9 ms, total: 627 ms
Wall time: 635 ms


### Remove Accented Chars

In [23]:
import unicodedata

In [24]:
x = 'fiancé, résumé, El Niño, déjà vu.'

In [25]:
def remove_accented_chars(x):
    """Return ``x`` with accented letters folded to their ASCII base letters.

    The text is decomposed with NFKD normalisation so that each accent
    becomes a separate combining character, then every non-ASCII character
    (including those combining marks) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

In [26]:
remove_accented_chars(x)

'fiance, resume, El Nino, deja vu.'

In [27]:
# Fold accented letters to their ASCII base letters in every article.
# NOTE(review): the special-character regex applied earlier keeps only A-Z, a-z,
# 0-9, space and hyphen, so accented letters have already been deleted outright
# (e.g. "fiancé" -> "fianc") before reaching this step — consider running this
# normalisation before the regex clean-up.
df['text'] = df.text.apply(lambda x: remove_accented_chars(x))

### Remove Stopwords

In [28]:
x = 'this is a stop words removal code'
' '.join([t for t in x.split() if t not in STOP_WORDS])

'stop words removal code'

In [29]:
# Keep only the tokens that are not in spaCy's English stop-word set.
df['text'] = df['text'].map(
    lambda text: ' '.join(tok for tok in text.split() if tok not in STOP_WORDS)
)
df.head()

Unnamed: 0,category,text,word_counts,char_counts,avg_word_len,stop_words_len,numerics_count
0,tech,tv future hands viewers home theatre systems p...,737,4333,4.786974,363,3
1,business,worldcom boss left books worldcom boss bernie ...,300,1842,5.036667,122,3
2,sport,tigers wary farrell gamble leicester rushed ma...,246,1342,4.361789,139,0
3,sport,yeading face newcastle fa cup premiership newc...,341,2176,5.240469,82,0
4,entertainment,ocean s raids box office ocean s crime caper s...,260,1579,4.973077,93,1


### Convert to base form

In [30]:
nlp = spacy.load('en_core_web_sm')

In [31]:
def make_to_base(x):
    """Return ``x`` with each token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    Tokens whose lemma is ``'-PRON-'`` (presumably spaCy's pronoun
    placeholder — confirm against the installed version) or ``'be'`` keep
    their original surface text instead of the lemma.
    """
    keep_surface = ('-PRON-', 'be')
    lemmas = [
        tok.text if str(tok.lemma_) in keep_surface else str(tok.lemma_)
        for tok in nlp(x)
    ]
    return ' '.join(lemmas)

In [32]:
x = 'kenichan dived times ball managed save 50 rest bounds'

In [33]:
make_to_base(x)

'kenichan dive times ball manage save 50 rest bound'

In [34]:
# Lemmatize the whole corpus in batches with nlp.pipe instead of one nlp() call
# per row; the per-row version was slow enough that the run was interrupted.
# The parser and NER components are disabled because only token lemmas are read.
_keep_surface = ('-PRON-', 'be')  # same lemmas make_to_base leaves as surface text
df['text'] = [
    ' '.join(tok.text if tok.lemma_ in _keep_surface else tok.lemma_ for tok in doc)
    for doc in nlp.pipe(df.text, batch_size=50, disable=['parser', 'ner'])
]

KeyboardInterrupt: 

In [None]:
df.head()

### Word Cloud Visualization

In [None]:
from wordcloud import WordCloud
%matplotlib inline

In [None]:
text = ' '.join(df['text'])

In [None]:
# Show only a short preview; displaying the whole joined corpus bloats the notebook.
text[:500]

In [None]:
wc = WordCloud(width=800, height=500).generate(text)

In [None]:
# Render the word cloud image on its own axes with the axis decorations hidden.
fig, ax = plt.subplots()
ax.imshow(wc)
ax.axis('off')
plt.show()

In [None]:
df.head()

In [None]:
dfr = df.copy()

In [None]:
dfr.head()

In [None]:
def convert_category_to_num(x):
    """Return the integer label stored in ``cat_map`` for category name ``x``.

    Raises ``KeyError`` if ``x`` is not a key of ``cat_map``.
    """
    label = cat_map[x]
    return label

In [None]:
# Replace category names with their integer labels, then take the label column.
dfr['category'] = dfr['category'].map(convert_category_to_num)
y = dfr.iloc[:, :1]

In [None]:
x = dfr.iloc[:, 1:2]

In [None]:
dfr.head()

In [None]:
dfr.category.value_counts()

In [None]:
# Down-sample every class to the size of the smallest one so the classes are
# balanced. The size is derived from the data rather than hard-coded
# (the smallest class in this dataset has 386 rows).
_n_per_class = int(dfr.category.value_counts().min())
(dfr_normalized0, dfr_normalized1, dfr_normalized2,
 dfr_normalized3, dfr_normalized4) = (
    dfr[dfr['category'] == label].sample(n=_n_per_class, random_state=1)
    for label in range(5)
)

In [None]:
# Stack the per-class samples into one balanced frame.
_balanced_parts = [
    dfr_normalized0,
    dfr_normalized1,
    dfr_normalized2,
    dfr_normalized3,
    dfr_normalized4,
]
dfr_normalized = pd.concat(_balanced_parts)

In [None]:
dfr_normalized.category.value_counts()

In [None]:
# Shuffle the rows; seeded so the row order (and everything derived from it)
# is reproducible across kernel restarts.
dfr_normalized = dfr_normalized.sample(frac=1, random_state=1)

In [None]:
dfr_normalized.head()

In [None]:
# Features are every column after 'category'; the label is the first column.
# X is copied because a new column is assigned to it later, which would
# otherwise write into a slice of dfr_normalized (SettingWithCopyWarning).
X = dfr_normalized.iloc[:, 1:].copy()
y = dfr_normalized.iloc[:, 0:1]

### Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()
text_counts = cv.fit_transform(X['text'])

In [None]:
# The sparse matrix exposes its shape directly; no need to densify it first.
text_counts.shape

In [None]:
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use the new method when available and fall back otherwise.
_feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
dfr_bog = pd.DataFrame(text_counts.toarray(), columns=_feature_names)

In [None]:
dfr_bog.head(2)

### ML Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [None]:
# Candidate models; every estimator shares the same seed (42) for repeatable fits.
sgd = SGDClassifier(max_iter=200, random_state=42, n_jobs=-1)
lgr = LogisticRegression(max_iter=200, random_state=42)
lgr_cv = LogisticRegressionCV(cv=2, max_iter=1000, random_state=42)
svm = LinearSVC(max_iter=200, random_state=42)
rfc = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

In [None]:
# Display name -> estimator; classify() iterates over this mapping.
clf = {
    'SGD': sgd,
    'LGR': lgr,
    'LGR-CV': lgr_cv,
    'SVM': svm,
    'RFC': rfc,
}

In [None]:
def classify(X, y):
    """Fit every estimator in the module-level ``clf`` dict and print its test metrics.

    The data is split 80/20 (stratified on ``y``, fixed seed) *before*
    scaling, and the scaler is fitted on the training portion only, so no
    test-set statistics leak into the features the models are trained on.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like
        Class labels; a single-column frame is flattened to 1-D so the
        estimators receive the label shape they expect.

    Returns
    -------
    None
        Accuracy and the classification report are printed for each model.
    """
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for key, model in clf.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        ac = accuracy_score(y_test, y_pred)
        cr = classification_report(y_test, y_pred)
        print(key, ' -------------> ', ac)
        print('classification_report -------------> ')
        print(cr)

In [None]:
%%time
classify(dfr_bog, y)

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer()
dfr_idf = tfidf.fit_transform(X['text'])

In [None]:
dfr_idf

In [None]:
%%time
classify(pd.DataFrame(dfr_idf.toarray()), y)

### Word2Vec (spaCy document vectors)

In [None]:
def get_vec(x):
    """Return the spaCy document vector of ``x`` as a single-row 2-D array.

    ``x`` is processed with the module-level pipeline ``nlp``; the resulting
    ``doc.vector`` is reshaped to ``(1, n_dims)`` so that the vectors of many
    documents can be concatenated row-wise.
    """
    vector = nlp(x).vector
    return vector.reshape(1, -1)

In [None]:
%%time
# Store one (1, n_dims) document vector per article.
X['vector'] = X['text'].map(get_vec)

In [None]:
X.head()

In [None]:
# Stack the per-article (1, n_dims) vectors row-wise into one 2-D matrix.
_doc_vectors = X['vector'].tolist()
dfr_word2vec = np.concatenate(_doc_vectors, axis=0)

In [None]:
dfr_word2vec

In [None]:
dfr_word2vec.shape

In [None]:
classify(pd.DataFrame(dfr_word2vec), y)

### Load Pretrained Model

In [None]:
samples = dfr_normalized.iloc[:, 1:2]
y = dfr_normalized.iloc[:, 0:1]

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Build the vocabulary (20,000 most frequent tokens, sequences padded/truncated
# to 200) from the article texts held in `samples`.
# The cell previously used `tf` without importing it and referenced
# `train_samples`, which is not defined anywhere earlier in the notebook.
# NOTE(review): if a separate training split is intended, adapt on that split instead.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(samples['text'].to_numpy()).batch(128)
vectorizer.adapt(text_ds)