In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Load the training CSV (ag_news_train.csv) into a DataFrame.
data = pd.read_csv('../datasets/ag_news_train.csv')

In [3]:
# Preview the first five rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,label,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [4]:
# Dataset dimensions as (rows, columns).
data.shape

(120000, 3)

In [6]:
# Boolean Series flagging rows that exactly repeat an earlier row.
data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
119995    False
119996    False
119997    False
119998    False
119999    False
Length: 120000, dtype: bool

In [7]:
# Count of fully duplicated rows (sum of the boolean flags).
data.duplicated().sum()

np.int64(0)

In [8]:
# Boolean frame marking each missing cell.
data.isnull()

Unnamed: 0,label,title,description
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
119995,False,False,False
119996,False,False,False
119997,False,False,False
119998,False,False,False


In [9]:
# Number of missing values per column.
data.isnull().sum()

label          0
title          0
description    0
dtype: int64

In [10]:
import re
import string

In [11]:
# Peek at the raw titles before any cleaning.
data["title"].head(5)

0    Wall St. Bears Claw Back Into the Black (Reuters)
1    Carlyle Looks Toward Commercial Aerospace (Reu...
2      Oil and Economy Cloud Stocks' Outlook (Reuters)
3    Iraq Halts Oil Exports from Main Southern Pipe...
4    Oil prices soar to all-time record, posing new...
Name: title, dtype: object

In [13]:
# Lowercase every title: split on whitespace, lowercase each token and
# re-join with single spaces (which also collapses repeated whitespace).

data["title"] = data["title"].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)

In [14]:
# Confirm the titles are now lowercase.
data["title"].head(5)

0    wall st. bears claw back into the black (reuters)
1    carlyle looks toward commercial aerospace (reu...
2      oil and economy cloud stocks' outlook (reuters)
3    iraq halts oil exports from main southern pipe...
4    oil prices soar to all-time record, posing new...
Name: title, dtype: object

In [15]:
# Peek at the raw descriptions before cleaning.
data["description"].head(5)

0    Reuters - Short-sellers, Wall Street's dwindli...
1    Reuters - Private investment firm Carlyle Grou...
2    Reuters - Soaring crude prices plus worries\ab...
3    Reuters - Authorities have halted oil export\f...
4    AFP - Tearaway world oil prices, toppling reco...
Name: description, dtype: object

In [16]:
# Lowercase every description: split on whitespace, lowercase each token and
# re-join with single spaces (which also collapses repeated whitespace).
data["description"] = data["description"].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)

In [17]:
# Confirm the descriptions are now lowercase.
data["description"].head(5)

0    reuters - short-sellers, wall street's dwindli...
1    reuters - private investment firm carlyle grou...
2    reuters - soaring crude prices plus worries\ab...
3    reuters - authorities have halted oil export\f...
4    afp - tearaway world oil prices, toppling reco...
Name: description, dtype: object

In [19]:
#remove unwanted links in the text data

# Match a URL anywhere in the text rather than only at the very start of a
# whitespace-separated token: links wrapped in punctuation or markup
# (e.g. "(http://..." or href="http://...") slipped past the previous
# ^-anchored, per-token pattern.
_URL_RE = re.compile(r'https?://\S+')


def _remove_links(text):
    """Return ``text`` with http(s) URLs removed and whitespace collapsed to single spaces."""
    # Substitute a space so the words on either side of a URL stay separate,
    # then split/join to squeeze the resulting gaps.
    return " ".join(_URL_RE.sub(" ", text).split())


data["title"] = data["title"].apply(_remove_links)

data["description"] = data["description"].apply(_remove_links)

In [20]:
#remove punctuation from the text data

def remove_punctuations(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so
    "all-time" becomes "alltime".
    """
    # Translation table whose third argument lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)
# Strip punctuation from the titles; only the "title" column is passed through this step here.
data["title"] = data["title"].apply(remove_punctuations)

In [21]:
# Inspect the titles after punctuation removal.
data["title"].head(10)

0       wall st bears claw back into the black reuters
1    carlyle looks toward commercial aerospace reuters
2         oil and economy cloud stocks outlook reuters
3    iraq halts oil exports from main southern pipe...
4    oil prices soar to alltime record posing new m...
5             stocks end up but near year lows reuters
6                   money funds fell in latest week ap
7    fed minutes show dissent over inflation usatod...
8                                 safety net forbescom
9               wall st bears claw back into the black
Name: title, dtype: object

In [22]:
#remove numbers in the data set

# Raw string: '\d' inside a normal string literal is an invalid escape
# sequence, which current Python versions report with a warning.
data["title"] = data["title"].str.replace(r'\d+', '', regex=True)

In [23]:
# Inspect the last titles after digit removal.
data["title"].tail(10)

119990              barack obama gets  million book deal ap
119991              rauffer beats favorites to win downhill
119992          iraqis face winter shivering by candlelight
119993    au says sudan begins troop withdrawal from darfur
119994      syria redeploys some security forces in lebanon
119995     pakistans musharraf says wont quit as army chief
119996                     renteria signing a topshelf deal
119997                      saban not going to dolphins yet
119998                                     todays nfl games
119999                         nets get carter from raptors
Name: title, dtype: object

In [28]:
#remove stopwords from the data set 

!pip install nltk

import nltk



In [30]:
# Fetch the NLTK stop-word corpus into a project-local directory instead of the default NLTK data path.
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Read the English stop-word file (one word per line) into a list.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as file:
    sw = file.read().splitlines()

In [32]:
# Show the loaded stop-word list.
sw

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [33]:
# Drop stop words from each title. Membership is tested against a set
# (constant-time lookup) instead of scanning the whole stop-word list once
# per token across every title.
# NOTE(review): punctuation was stripped from the titles earlier, so
# apostrophe forms in the list (e.g. "don't") cannot match their stripped
# counterparts (e.g. "dont") - confirm whether those should be removed too.
stop_words = set(sw)
data["title"] = data["title"].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_words)
)

In [34]:
# Inspect the titles after stop-word removal.
data["title"].head(10)

0                wall st bears claw back black reuters
1    carlyle looks toward commercial aerospace reuters
2             oil economy cloud stocks outlook reuters
3    iraq halts oil exports main southern pipeline ...
4    oil prices soar alltime record posing new mena...
5                    stocks end near year lows reuters
6                      money funds fell latest week ap
7       fed minutes show dissent inflation usatodaycom
8                                 safety net forbescom
9                        wall st bears claw back black
Name: title, dtype: object

In [38]:
# Stemming: reduce inflected words to a common stem (e.g. "looks" -> "look").
# Porter stems are not always dictionary words (e.g. "economy" -> "economi").

from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [39]:
# Stem each whitespace-separated token of the title and re-join with single spaces.
data["title"] = data["title"].apply(
    lambda text: " ".join(ps.stem(token) for token in text.split())
)

In [40]:
# Inspect the titles after stemming.
data["title"].head(10)

0                  wall st bear claw back black reuter
1          carlyl look toward commerci aerospac reuter
2               oil economi cloud stock outlook reuter
3    iraq halt oil export main southern pipelin reuter
4    oil price soar alltim record pose new menac us...
5                       stock end near year low reuter
6                       money fund fell latest week ap
7            fed minut show dissent inflat usatodaycom
8                                 safeti net forbescom
9                         wall st bear claw back black
Name: title, dtype: object

In [41]:
#building a vocabulary