## Imports

In [19]:
# Core data-handling libraries
import pandas as pd
import numpy as np
import re

# gensim text-preprocessing helpers
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# NLTK lemmatizer / stemmer classes
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import — the names it brings into the namespace are not
# visible from this cell; confirm which ones are actually needed and import them explicitly.
from nltk.stem.porter import *
# NOTE(review): numpy is already imported above; this line repeats that import.
import numpy as np
# Seed NumPy's global random generator so NumPy-based random operations repeat between runs
np.random.seed(2018)
import nltk
from nltk.corpus import stopwords
# Download the WordNet corpus (presumably required by WordNetLemmatizer — confirm).
# NOTE(review): the `stopwords` corpus is read later via stopwords.words('english')
# but is not downloaded here — confirm it is already installed on a fresh environment.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JakubowskiL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Input

In [20]:
# Load the BBC articles dataset from a single CSV file
input_df = pd.read_csv('BBCArticles.csv')
# Show five randomly sampled rows, then the number of articles per category
display(
    input_df.sample(5),
    input_df['Category'].value_counts(),
)

Unnamed: 0,ArticleId,Text,Category
427,930,casual gaming to take off games aimed at ca...,tech
60,464,dozens held over id fraud site twenty-eight pe...,tech
283,533,england s defensive crisis grows england s def...,sport
841,174,commons hunt protest charges eight protesters ...,politics
913,1792,choose hope over fear - kennedy voters will ha...,politics


sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

## Data Preprocessing

Steps:
1. Splitting documents into words (tokenization) and stripping surrounding punctuation
2. Removing stop-words
3. Lemmatization of words


In [22]:
# splitting and cleaning text
# Step 1: whitespace tokenisation — one list of raw tokens per document.
splitted_texts = [doc.split() for doc in input_df['Text']]
input_df['Splitted Text'] = splitted_texts

# Step 2: trim punctuation from both ends of every token (inner characters,
# e.g. the hyphen in "ex-boss", are kept).
# Tokens made only of stripped characters (such as a standalone "-") collapse to
# an empty string; they are dropped so that no '' tokens reach later steps.
# Iterating over the column values (instead of looking rows up with `[i]`)
# keeps this independent of the DataFrame's index labels.
stripped = [
    [token for token in (word.strip(' .,)(""-\'') for word in words) if token]
    for words in input_df['Splitted Text']
]
input_df['Stripped Text'] = stripped
input_df.head(5)

Unnamed: 0,ArticleId,Text,Category,Splitted Text,Stripped Text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers..."
1,154,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, in, $168m, payout, eighteen, f..."


In [31]:
# removing stop words
# English stop-word list from NLTK, held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Keep every token that is not a stop word, document by document.
# The column values are iterated directly rather than fetched with `[i]`:
# integer `[i]` on a Series is a label lookup, which fails or misaligns rows
# as soon as the index is not the default 0..n-1 range.
filtered = [
    [w for w in words if w not in stop_words]
    for words in input_df['Stripped Text']
]
input_df['Filtered Text'] = filtered

In [32]:
# Preview the first rows only, rather than rendering the whole DataFrame as cell output
input_df.head(10)

Unnamed: 0,ArticleId,Text,Category,Splitted Text,Stripped Text,Filtered Text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,"[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers...","[worldcom, ex-boss, launches, defence, lawyers..."
1,154,german business confidence slides german busin...,business,"[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,...","[german, business, confidence, slides, german,..."
2,1101,bbc poll indicates economic gloom citizens in ...,business,"[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize...","[bbc, poll, indicates, economic, gloom, citize..."
3,1976,lifestyle governs mobile choice faster bett...,tech,"[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b...","[lifestyle, governs, mobile, choice, faster, b..."
4,917,enron bosses in $168m payout eighteen former e...,business,"[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, in, $168m, payout, eighteen, f...","[enron, bosses, $168m, payout, eighteen, forme..."
5,1582,howard truanted to play snooker conservative...,politics,"[howard, truanted, to, play, snooker, conserva...","[howard, truanted, to, play, snooker, conserva...","[howard, truanted, play, snooker, conservative..."
6,651,wales silent on grand slam talk rhys williams ...,sport,"[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, on, grand, slam, talk, rhys, w...","[wales, silent, grand, slam, talk, rhys, willi..."
7,1797,french honour for director parker british film...,entertainment,"[french, honour, for, director, parker, britis...","[french, honour, for, director, parker, britis...","[french, honour, director, parker, british, fi..."
8,2034,car giant hit by mercedes slump a slump in pro...,business,"[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, by, mercedes, slump, a, slum...","[car, giant, hit, mercedes, slump, slump, prof..."
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,"[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ...","[fockers, fuel, festive, film, chart, comedy, ..."
