# AA Assignment 

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string 
from nltk.corpus import stopwords
wn = nltk.WordNetLemmatizer()

## 2. Read Data

In [2]:
pd.set_option('display.max_colwidth', 100)
dat = pd.read_csv('bbc-text.csv')
dat.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home theatre systems plasma high-definition tvs and dig...
1,business,worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseein...
2,sport,tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy...
3,sport,yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier ...
4,entertainment,ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney ...
5,politics,howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory lea...
6,politics,blair prepares to name poll date tony blair is likely to name 5 may as election day when parliam...
7,sport,henman hopes ended in dubai third seed tim henman slumped to a straight sets defeat in his rain-...
8,sport,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited retur...
9,entertainment,last star wars not for children the sixth and final star wars movie may not be suitable for yo...


In [3]:
dat['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [4]:
print(dat['category'].isnull().any())
print(dat['text'].isnull().any())

False
False


## 3. Preprocess Data

In [5]:
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W|_)+"," ",text)
    
    return text

def get_stop_words(stop_file_path):
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("stopwords.txt")

cleaned_text = dat['text'].apply(lambda x:pre_process(x))

In [6]:
cleaned_text.head(10)

0    tv future in the hands of viewers with home theatre systems plasma high definition tvs and digit...
1    worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing a...
2    tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy f...
3    yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier ...
4    ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney br...
5    howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory lea...
6    blair prepares to name poll date tony blair is likely to name may as election day when parliamen...
7    henman hopes ended in dubai third seed tim henman slumped to a straight sets defeat in his rain ...
8    wilkinson fit to face edinburgh england captain jonny wilkinson will make his long awaited retur...
9    last star wars not for children the sixth and fina

## 4. Transform Data

### 4.1 Bag-of-Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_df=0.15, stop_words=stopwords, max_features=9000)
word_counts = count_vect.fit_transform(cleaned_text.tolist())
print(word_counts.shape)

(2225, 9000)


In [8]:
sum_words = word_counts.sum(axis=0)
words_freq_bow = [(word, sum_words[0, idx]) for word, idx in count_vect.vocabulary_.items()]
words_freq_bow = sorted(words_freq_bow, key = lambda x: x[1], reverse=True)

In [9]:
print('The highest frequency words:')
words_freq_bow[0:20] 

The highest frequency words:


[('bn', 958),
 ('film', 890),
 ('music', 839),
 ('labour', 804),
 ('election', 662),
 ('party', 630),
 ('games', 622),
 ('england', 618),
 ('blair', 603),
 ('technology', 561),
 ('minister', 561),
 ('public', 557),
 ('mobile', 546),
 ('british', 542),
 ('tv', 525),
 ('european', 525),
 ('players', 524),
 ('sales', 494),
 ('plans', 486),
 ('six', 484)]

In [10]:
count_vect.get_feature_names()

['aaa',
 'aaas',
 'abandoned',
 'abandoning',
 'abba',
 'abbas',
 'abbasi',
 'abbott',
 'abc',
 'aberdeen',
 'abeyie',
 'abiding',
 'abilities',
 'ability',
 'able',
 'abn',
 'abolish',
 'abolishing',
 'abortion',
 'abortionist',
 'abroad',
 'absa',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'abuse',
 'abused',
 'abuses',
 'ac',
 'academic',
 'academy',
 'accelerated',
 'accelerating',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accessing',
 'accession',
 'accident',
 'acclaim',
 'acclaimed',
 'accolade',
 'accommodate',
 'accompanied',
 'accompanying',
 'accomplished',
 'according',
 'account',
 'accountability',
 'accountable',
 'accounted',
 'accounting',
 'accounts',
 'accuracy',
 'accurate',
 'accurately',
 'accusations',
 'accuse',
 'accused',
 'accusing',
 'ace',
 'aceh',
 'achieve',
 'achieved',
 'achievement',
 'achievements',
 'achieving',
 'achilles',
 'acknowledge',
 'acknowledged',
 'acquire',
 'acqu

In [87]:
feature_names=np.array(count_vect.get_feature_names())

pd.DataFrame(word_counts.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8990,8991,8992,8993,8994,8995,8996,8997,8998,8999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2223,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
word_counts

<2225x9000 sparse matrix of type '<class 'numpy.int64'>'
	with 237910 stored elements in Compressed Sparse Row format>

### 4.2 TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True)
word_tfidf = tfidf_transformer.fit_transform(word_counts)

In [14]:
tfidf_transformer.idf_.shape

(9000,)

In [15]:
sorted_by_idf = np.argsort(tfidf_transformer.idf_)
print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:100]]))

Features with lowest idf:
['according' 'six' 'public' 'month' 'day' 'bn' 'based' 'called' 'british'
 'minister' 'hit' 'international' 'move' 'don' 'director' 'help' 'despite'
 'plans' 'half' 'london' 'record' 'european' 'start' 'money' 're' 'seen'
 'future' 'spokesman' 'players' 'business' 'earlier' 'companies' 'britain'
 'run' 'able' 'saying' 'europe' 'biggest' 'times' 'due' 'taking'
 'decision' 'lost' 'team' 'national' 'recent' 'deal' 'england' 'industry'
 'past' 'film' 'life' 'found' 'service' 'final' 'lot' 'labour' 'january'
 'technology' 'music' 'executive' 'days' 'looking' 'look' 'following'
 'games' 'action' 'strong' 'major' 'growth' 'using' 'ahead' 'real'
 'believe' 'michael' 'currently' 'line' 'party' 'hard' 'little'
 'president' 'playing' 'held' 'election' 'december' 'played' 'services'
 'system' 'office' 'current' 'david' 'figures' 'left' 'firms' 've'
 'announced' 'include' 'john' 'cut' 'support']


In [88]:
pd.DataFrame(word_tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8990,8991,8992,8993,8994,8995,8996,8997,8998,8999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Extract keyword with TF-IDF

In [17]:
topn = 10
with_tf_idf = []
no_tf_idf = []


for idx, doc in cleaned_text.items():
    #generate tf-idf for the given document
    tf_idf_vector=word_tfidf[idx]
    
    temp = pd.DataFrame(zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data),columns=['feature_number','tf_idf'])
    temp.sort_values('tf_idf', ascending = False, inplace = True)
    
    #use only topn items from vector
     
    topn_items = temp[:topn]

    tf_idf = []
    word = []

    for index, row in topn_items.iterrows():
        #print(int(row['feature_number']))
        fname = feature_names[int(row['feature_number'])]
        word.append(fname)
        tf_idf.append(round(row['tf_idf'], 3))

    result1 = dict(zip(word, tf_idf))
    result2 = word
    
    with_tf_idf.append(result1)
    no_tf_idf.append(result2)
    

In [18]:
dat['cleaned_text'] = cleaned_text
dat['keywords'] = no_tf_idf
dat['keywords_tfidf'] = with_tf_idf
dat.head(10)

Unnamed: 0,category,text,cleaned_text,keywords,keywords_tfidf
0,tech,tv future in the hands of viewers with home theatre systems plasma high-definition tvs and dig...,tv future in the hands of viewers with home theatre systems plasma high definition tvs and digit...,"[tv, dvr, definition, tivo, watch, satellite, content, brands, brand, programmes]","{'tv': 0.385, 'dvr': 0.266, 'definition': 0.175, 'tivo': 0.171, 'watch': 0.155, 'satellite': 0.1..."
1,business,worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseein...,worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing a...,"[worldcom, ebbers, myers, accounting, fraud, defence, witness, bn, collapsed, boss]","{'worldcom': 0.481, 'ebbers': 0.442, 'myers': 0.351, 'accounting': 0.274, 'fraud': 0.14, 'defenc..."
2,sport,tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy...,tigers wary of farrell gamble leicester say they will not be rushed into making a bid for andy f...,"[farrell, gamble, leicester, tigers, rugby, league, union, knee, andy, involved]","{'farrell': 0.535, 'gamble': 0.321, 'leicester': 0.243, 'tigers': 0.218, 'rugby': 0.215, 'league..."
3,sport,yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier ...,yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier ...,"[league, west, cup, united, drawn, swindon, newcastle, brentford, luton, exeter]","{'league': 0.215, 'west': 0.18, 'cup': 0.158, 'united': 0.154, 'drawn': 0.152, 'swindon': 0.152,..."
4,entertainment,ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney ...,ocean s twelve raids box office ocean s twelve the crime caper sequel starring george clooney br...,"[ocean, sequel, twelve, clooney, starring, eleven, pitt, box, roberts, office]","{'ocean': 0.418, 'sequel': 0.363, 'twelve': 0.254, 'clooney': 0.199, 'starring': 0.189, 'eleven'..."
5,politics,howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory lea...,howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory lea...,"[howard, mongrel, rattled, hain, party, labour, election, michael, tory, dixon]","{'howard': 0.471, 'mongrel': 0.272, 'rattled': 0.254, 'hain': 0.247, 'party': 0.206, 'labour': 0..."
6,politics,blair prepares to name poll date tony blair is likely to name 5 may as election day when parliam...,blair prepares to name poll date tony blair is likely to name may as election day when parliamen...,"[parliament, election, blair, marr, april, name, bill, queen, announce, commons]","{'parliament': 0.355, 'election': 0.341, 'blair': 0.3, 'marr': 0.223, 'april': 0.208, 'name': 0...."
7,sport,henman hopes ended in dubai third seed tim henman slumped to a straight sets defeat in his rain-...,henman hopes ended in dubai third seed tim henman slumped to a straight sets defeat in his rain ...,"[henman, rain, seed, ljubicic, frustrating, dubai, straight, match, robredo, kiefer]","{'henman': 0.585, 'rain': 0.307, 'seed': 0.245, 'ljubicic': 0.208, 'frustrating': 0.2, 'dubai': ..."
8,sport,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited retur...,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long awaited retur...,"[wilkinson, internationals, injury, edinburgh, england, newcastle, captain, cup, saturday, aggra...","{'wilkinson': 0.456, 'internationals': 0.254, 'injury': 0.23, 'edinburgh': 0.205, 'england': 0.1..."
9,entertainment,last star wars not for children the sixth and final star wars movie may not be suitable for yo...,last star wars not for children the sixth and final star wars movie may not be suitable for youn...,"[wars, lucas, rating, film, star, revenge, suitable, children, chronicles, transformation]","{'wars': 0.409, 'lucas': 0.347, 'rating': 0.318, 'film': 0.29, 'star': 0.245, 'revenge': 0.222, ..."


## 6. Association rule mining

In [19]:
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import association_rules

In [20]:
temp = dat[dat["category"]=="sport"]
temp = temp.loc[0:,"keywords"]
sport_keywords_df = temp.apply(pd.Series)

temp = dat[dat["category"]=="business"]
temp = temp.loc[0:,"keywords"]
business_keywords_df = temp.apply(pd.Series)

temp = dat[dat["category"]=="politics"]
temp = temp.loc[0:,"keywords"]
politics_keywords_df = temp.apply(pd.Series)

temp = dat[dat["category"]=="tech"]
temp = temp.loc[0:,"keywords"]
tech_keywords_df = temp.apply(pd.Series)

temp = dat[dat["category"]=="entertainment"]
temp = temp.loc[0:,"keywords"]
entertainment_keywords_df = temp.apply(pd.Series)

In [21]:
print(sport_keywords_df.shape)
print(business_keywords_df.shape)
print(politics_keywords_df.shape)
print(tech_keywords_df.shape)
print(entertainment_keywords_df.shape)

(511, 10)
(510, 10)
(417, 10)
(401, 10)
(386, 10)


In [22]:
sport_full_list=pd.Series([])
for col in sport_keywords_df:
    sport_full_list = sport_full_list.append(sport_keywords_df[col].dropna())

business_full_list=pd.Series([])
for col in business_keywords_df:
    business_full_list = business_full_list.append(business_keywords_df[col].dropna())

politics_full_list=pd.Series([])
for col in politics_keywords_df:
    politics_full_list = politics_full_list.append(politics_keywords_df[col].dropna())
    
tech_full_list=pd.Series([])
for col in tech_keywords_df:
    tech_full_list = tech_full_list.append(tech_keywords_df[col].dropna())
    
entertainment_full_list=pd.Series([])
for col in entertainment_keywords_df:
    entertainment_full_list = entertainment_full_list.append(entertainment_keywords_df[col].dropna())

  sport_full_list=pd.Series([])
  business_full_list=pd.Series([])
  politics_full_list=pd.Series([])
  tech_full_list=pd.Series([])
  entertainment_full_list=pd.Series([])


In [23]:
print(sport_full_list)

print(business_full_list)

print(politics_full_list)

print(tech_full_list)

print(entertainment_full_list)

2           farrell
3            league
7            henman
8         wilkinson
14          roddick
           ...     
2190       original
2195             am
2209    opportunity
2218       francais
2224          minds
Length: 5110, dtype: object
1         worldcom
11          virgin
12          prices
15              lg
18             ufj
           ...    
2201       concern
2212     expecting
2214    presidency
2219     ownership
2220          rise
Length: 5100, dtype: object
5           howard
6       parliament
13           hague
16            stem
28          clarke
           ...    
2203      academic
2206         prime
2210        polled
2221        deport
2223      prisoner
Length: 4170, dtype: object
0              tv
19       argonaut
20            fbi
21      bandwidth
24       sonaptic
          ...    
2204    developer
2207          pcs
2213      offices
2215     analysis
2217      energis
Length: 4010, dtype: object
4           ocean
9            wars
10         schol

In [24]:
print(sport_full_list.value_counts())

print(business_full_list.value_counts())

print(politics_full_list.value_counts())

print(tech_full_list.value_counts())

print(entertainment_full_list.value_counts())

england       49
wales         37
chelsea       37
cup           36
rugby         34
              ..
drive          1
sealing        1
committing     1
phil           1
poll           1
Length: 1848, dtype: int64
bn            90
economy       51
growth        43
bank          42
shares        39
              ..
martin         1
surplus        1
foster         1
conference     1
online         1
Length: 1990, dtype: int64
labour      83
election    74
blair       72
party       58
brown       50
            ..
mixed        1
future       1
pig          1
voted        1
gibbons      1
Length: 1699, dtype: int64
mobile      44
games       34
software    34
users       32
computer    32
            ..
nm           1
wrong        1
stat         1
webster      1
picture      1
Length: 1520, dtype: int64
film      109
album      33
band       33
awards     32
music      30
         ... 
george      1
blast       1
suits       1
nighty      1
watson      1
Length: 1728, dtype: int64


In [25]:
sport_head = sport_full_list.value_counts().head(50).to_frame()

business_head = business_full_list.value_counts().head(50).to_frame()

politics_head = politics_full_list.value_counts().head(50).to_frame()

tech_head = tech_full_list.value_counts().head(50).to_frame()

entertainment_head = entertainment_full_list.value_counts().head(50).to_frame()

### 6.1 Sports

In [26]:
trans = []
for i in range(0, 511):
    trans.append([str(sport_keywords_df.values[i,j]) for j in range(0, 10)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

(511, 10)


In [27]:
print(trans)

[['farrell' 'gamble' 'leicester' ... 'knee' 'andy' 'involved']
 ['league' 'west' 'cup' ... 'brentford' 'luton' 'exeter']
 ['henman' 'rain' 'seed' ... 'match' 'robredo' 'kiefer']
 ...
 ['friendlies' 'manager' 'players' ... 'international' 'probably'
  'opportunity']
 ['davies' 'ospreys' 'gloucester' ... 'stay' 'summer' 'francais']
 ['souness' 'shearer' 'goal' ... 'delight' 'bearing' 'minds']]


In [28]:
#Transforms the input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

# getting the shape of the data
data_encoded.shape

(511, 1848)

In [29]:
data_encoded

Unnamed: 0,aaa,aaas,abandoned,abbott,aberdeen,abeyie,absent,ac,accelerated,accident,...,yannick,yapp,yards,yelling,york,youth,zambia,zealand,zero,zimbabwe
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
507,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
508,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
509,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
data_encoded = data_encoded.loc[:, sport_head.index]
data_encoded.shape

(511, 50)

In [31]:
data_encoded

Unnamed: 0,england,wales,chelsea,cup,rugby,injury,ireland,match,club,champion,...,thanou,round,spain,half,kenteris,referee,iaaf,barcelona,mourinho,tennis
0,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
507,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
508,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
509,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Apriori Algo (Sports)

In [32]:
from mlxtend.frequent_patterns import apriori
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)

In [33]:
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)
frequent_itemsets[['support']].describe()

Unnamed: 0,support
count,89.0
mean,0.02975
std,0.017086
min,0.011742
25%,0.015656
50%,0.02544
75%,0.035225
max,0.09589


In [34]:
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)
rules_c

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,"(athens, thanou)",(kenteris),0.015656,0.025440,0.015656,1.000000,39.307692,0.015257,inf
63,"(iaaf, athens)","(kenteris, thanou)",0.011742,0.025440,0.011742,1.000000,39.307692,0.011443,inf
47,"(iaaf, athens)",(kenteris),0.011742,0.025440,0.011742,1.000000,39.307692,0.011443,inf
40,"(iaaf, athens)",(thanou),0.011742,0.027397,0.011742,1.000000,36.500000,0.011420,inf
34,"(kenteris, athens)",(thanou),0.015656,0.027397,0.015656,1.000000,36.500000,0.015227,inf
...,...,...,...,...,...,...,...,...,...
1,(france),(england),0.041096,0.095890,0.013699,0.333333,3.476190,0.009758,1.356164
13,(seed),(match),0.046967,0.052838,0.015656,0.333333,6.308642,0.013174,1.420744
3,(ireland),(wales),0.060665,0.072407,0.019569,0.322581,4.455100,0.015177,1.369304
4,(williams),(wales),0.045010,0.072407,0.013699,0.304348,4.203290,0.010440,1.333415


In [35]:
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules_l.sort_values('lift', ascending = False, inplace = True)
rules_l

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
103,(kenteris),"(iaaf, athens, thanou)",0.025440,0.011742,0.011742,0.461538,39.307692,0.011443,1.835337
71,(kenteris),"(athens, thanou)",0.025440,0.015656,0.015656,0.615385,39.307692,0.015257,2.559295
100,"(kenteris, thanou)","(iaaf, athens)",0.025440,0.011742,0.011742,0.461538,39.307692,0.011443,1.835337
84,(kenteris),"(iaaf, athens)",0.025440,0.011742,0.011742,0.461538,39.307692,0.011443,1.835337
70,"(athens, thanou)",(kenteris),0.015656,0.025440,0.015656,1.000000,39.307692,0.015257,inf
...,...,...,...,...,...,...,...,...,...
15,(wales),(rugby),0.072407,0.066536,0.011742,0.162162,2.437202,0.006924,1.114134
7,(injury),(england),0.066536,0.095890,0.013699,0.205882,2.147059,0.007318,1.138508
6,(england),(injury),0.095890,0.066536,0.013699,0.142857,2.147059,0.007318,1.089041
3,(cup),(england),0.070450,0.095890,0.011742,0.166667,1.738095,0.004986,1.084932


In [36]:
rules_c[ (rules_c['lift'] >= 5) &
       (rules_c['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,"(athens, thanou)",(kenteris),0.015656,0.02544,0.015656,1.0,39.307692,0.015257,inf
63,"(iaaf, athens)","(kenteris, thanou)",0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
47,"(iaaf, athens)",(kenteris),0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
40,"(iaaf, athens)",(thanou),0.011742,0.027397,0.011742,1.0,36.5,0.01142,inf
34,"(kenteris, athens)",(thanou),0.015656,0.027397,0.015656,1.0,36.5,0.015227,inf
28,(kenteris),(thanou),0.02544,0.027397,0.02544,1.0,36.5,0.024743,inf
52,"(iaaf, kenteris)",(thanou),0.021526,0.027397,0.021526,1.0,36.5,0.020937,inf
58,"(iaaf, kenteris, athens)",(thanou),0.011742,0.027397,0.011742,1.0,36.5,0.01142,inf
60,"(iaaf, athens, thanou)",(kenteris),0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
29,(thanou),(kenteris),0.027397,0.02544,0.02544,0.928571,36.5,0.024743,13.643836


In [37]:
rules_l[ (rules_l['lift'] >= 5) &
       (rules_l['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
71,(kenteris),"(athens, thanou)",0.02544,0.015656,0.015656,0.615385,39.307692,0.015257,2.559295
70,"(athens, thanou)",(kenteris),0.015656,0.02544,0.015656,1.0,39.307692,0.015257,inf
97,"(iaaf, athens)","(kenteris, thanou)",0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
81,"(iaaf, athens)",(kenteris),0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
94,"(iaaf, athens, thanou)",(kenteris),0.011742,0.02544,0.011742,1.0,39.307692,0.011443,inf
91,(thanou),"(iaaf, kenteris)",0.027397,0.021526,0.021526,0.785714,36.5,0.020937,4.56621
73,(thanou),"(kenteris, athens)",0.027397,0.015656,0.015656,0.571429,36.5,0.015227,2.296804
74,"(iaaf, athens)",(thanou),0.011742,0.027397,0.011742,1.0,36.5,0.01142,inf
63,(thanou),(kenteris),0.027397,0.02544,0.02544,0.928571,36.5,0.024743,13.643836
62,(kenteris),(thanou),0.02544,0.027397,0.02544,1.0,36.5,0.024743,inf


### 6.2 Business

In [38]:
trans = []
for i in range(0, 510):
    trans.append([str(business_keywords_df.values[i,j]) for j in range(0, 10)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

(510, 10)


In [39]:
print(trans)

[['worldcom' 'ebbers' 'myers' ... 'bn' 'collapsed' 'boss']
 ['virgin' 'blue' 'profits' ... 'reported' 'qantas' 'november']
 ['prices' 'crude' 'oil' ... 'cut' 'temperatures' 'stocks']
 ...
 ['bush' 'budget' 'spending' ... 'administration' 'cuts' 'presidency']
 ['parking' 'wang' 'car' ... 'outdoor' 'parks' 'ownership']
 ['sales' 'retail' 'stores' ... 'clothing' 'spending' 'rise']]


In [40]:
#Transforms the input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

# getting the shape of the data
data_encoded.shape

(510, 1990)

In [41]:
data_encoded

Unnamed: 0,abn,absa,accept,access,account,accounted,accounting,accounts,accused,acquisition,...,york,yorkshire,youth,yuan,yugansk,yuganskneftegas,yuganskneftegaz,yukos,yushchenko,zone
0,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
506,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
507,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
508,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [42]:
data_encoded = data_encoded.loc[:, business_head.index]
data_encoded.shape

(510, 50)

In [43]:
data_encoded

Unnamed: 0,bn,economy,growth,bank,shares,sales,oil,economic,prices,china,...,price,debt,president,profit,stake,jobs,airline,retail,budget,december
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
2,False,False,False,False,False,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
506,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
507,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
508,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Apriori Algo (Business)

In [44]:
from mlxtend.frequent_patterns import apriori
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)

In [45]:
frequent_itemsets[['support']].describe()

Unnamed: 0,support
count,99.0
mean,0.029788
std,0.023218
min,0.011765
25%,0.015686
50%,0.02549
75%,0.034314
max,0.176471


In [46]:
rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)

In [47]:
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules_l.sort_values('lift', ascending = False, inplace = True)

In [48]:
rules_c[ (rules_c['lift'] >= 2) &
       (rules_c['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
56,"(yukos, court)",(russian),0.013725,0.043137,0.013725,1.0,23.181818,0.013133,inf
58,"(court, russian)",(yukos),0.013725,0.047059,0.013725,1.0,21.25,0.01308,inf
45,"(bn, russian)",(yukos),0.011765,0.047059,0.011765,1.0,21.25,0.011211,inf
53,"(oil, russian)",(yukos),0.017647,0.047059,0.015686,0.888889,18.888889,0.014856,8.576471
28,(russian),(yukos),0.043137,0.047059,0.037255,0.863636,18.352273,0.035225,6.988235
44,"(bn, yukos)",(russian),0.013725,0.043137,0.011765,0.857143,19.87013,0.011173,6.698039
27,(yukos),(russian),0.047059,0.043137,0.037255,0.791667,18.352273,0.035225,4.592941
51,"(yukos, oil)",(russian),0.021569,0.043137,0.015686,0.727273,16.859504,0.014756,3.508497
49,"(quarter, growth)",(economy),0.019608,0.1,0.013725,0.7,7.0,0.011765,3.0
47,"(economy, quarter)",(growth),0.019608,0.084314,0.013725,0.7,8.302326,0.012072,3.052288


In [49]:
rules_l[ (rules_l['lift'] >= 2) &
       (rules_l['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
106,"(yukos, court)",(russian),0.013725,0.043137,0.013725,1.0,23.181818,0.013133,inf
108,"(court, russian)",(yukos),0.013725,0.047059,0.013725,1.0,21.25,0.01308,inf
89,"(bn, russian)",(yukos),0.011765,0.047059,0.011765,1.0,21.25,0.011211,inf
88,"(bn, yukos)",(russian),0.013725,0.043137,0.011765,0.857143,19.87013,0.011173,6.698039
102,"(oil, russian)",(yukos),0.017647,0.047059,0.015686,0.888889,18.888889,0.014856,8.576471
70,(yukos),(russian),0.047059,0.043137,0.037255,0.791667,18.352273,0.035225,4.592941
71,(russian),(yukos),0.043137,0.047059,0.037255,0.863636,18.352273,0.035225,6.988235
100,"(yukos, oil)",(russian),0.021569,0.043137,0.015686,0.727273,16.859504,0.014756,3.508497
86,(budget),(deficit),0.023529,0.033333,0.011765,0.5,15.0,0.01098,1.933333
81,(euro),(dollar),0.027451,0.039216,0.015686,0.571429,14.571429,0.01461,2.24183


In [51]:
### 6.3 Politics

trans = []
for i in range(0, 417):
    trans.append([str(politics_keywords_df.values[i,j]) for j in range(0, 10)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

print(trans)

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, politics_head.index]

### Apriori Algo (Politics)
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)

frequent_itemsets[['support']].describe()

rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)

rules_c[ (rules_c['lift'] >= 5) &
       (rules_c['confidence'] >= 1) ]

(417, 10)
[['howard' 'mongrel' 'rattled' ... 'michael' 'tory' 'dixon']
 ['parliament' 'election' 'blair' ... 'queen' 'announce' 'commons']
 ['hague' 'ambition' 'party' ... 'leadership' 'writing' 'politics']
 ...
 ['pupils' 'survey' 'little' ... 'looked' 'foods' 'polled']
 ['kilroy' 'asylum' 'silk' ... 'argued' 'policy' 'deport']
 ['confess' 'prisoners' 'arms' ... 'stop' 'theory' 'prisoner']]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
246,"(blair, minister)",(prime),0.028777,0.062350,0.028777,1.0,16.038462,0.026983,inf
649,"(blair, chancellor, minister)","(prime, brown)",0.014388,0.031175,0.014388,1.0,32.076923,0.013940,inf
151,"(mps, brown)",(labour),0.011990,0.199041,0.011990,1.0,5.024096,0.009604,inf
475,"(prime, blair, chancellor)",(brown),0.014388,0.119904,0.014388,1.0,8.340000,0.012663,inf
467,"(tax, tory, howard)",(election),0.011990,0.177458,0.011990,1.0,5.635135,0.009863,inf
...,...,...,...,...,...,...,...,...,...
204,"(budget, election)",(brown),0.011990,0.119904,0.011990,1.0,8.340000,0.010553,inf
665,"(blair, labour, brown, prime, chancellor)",(minister),0.011990,0.045564,0.011990,1.0,21.947368,0.011444,inf
664,"(blair, labour, brown, minister, chancellor)",(prime),0.011990,0.062350,0.011990,1.0,16.038462,0.011243,inf
653,"(prime, chancellor, brown)","(blair, minister)",0.014388,0.028777,0.014388,1.0,34.750000,0.013974,inf


In [52]:
### 6.4 Tech

trans = []
for i in range(0, 401):
    trans.append([str(tech_keywords_df.values[i,j]) for j in range(0, 10)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

print(trans)

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, tech_head.index]

### Apriori Algo (Tech)
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)

frequent_itemsets[['support']].describe()

rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)

rules_c[ (rules_c['lift'] >= 5) &
       (rules_c['confidence'] >= 1) ]

(401, 10)
[['tv' 'dvr' 'definition' ... 'brands' 'brand' 'programmes']
 ['argonaut' 'games' 'cash' ... 'administrator' 'administrators' 'staff']
 ['fbi' 'mails' 'attachment' ... 'computer' 'contains' 'internet']
 ...
 ['domains' 'icann' 'domain' ... 'post' 'postal' 'offices']
 ['email' 'spam' 'systems' ... 'junk' 'motivated' 'analysis']
 ['attacks' 'traffic' 'sites' ... 'attack' 'capella' 'energis']]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
142,"(nintendo, sony, handheld)",(psp),0.017456,0.029925,0.017456,1.0,33.416667,0.016934,inf
129,"(gaming, nintendo)",(handheld),0.014963,0.032419,0.014963,1.0,30.846154,0.014478,inf
103,"(computer, machine)",(apple),0.014963,0.052369,0.014963,1.0,19.095238,0.014179,inf
135,"(nintendo, psp)",(handheld),0.01995,0.032419,0.01995,1.0,30.846154,0.019303,inf
104,"(computer, apple)",(machine),0.014963,0.027431,0.014963,1.0,36.454545,0.014552,inf
105,"(machine, apple)",(computer),0.014963,0.0798,0.014963,1.0,12.53125,0.013769,inf
144,"(sony, psp, handheld)",(nintendo),0.017456,0.0399,0.017456,1.0,25.0625,0.01676,inf
130,"(gaming, handheld)",(nintendo),0.014963,0.0399,0.014963,1.0,25.0625,0.014366,inf
73,"(phone, mobiles)",(mobile),0.012469,0.109726,0.012469,1.0,9.113636,0.011101,inf
137,"(psp, handheld)",(nintendo),0.01995,0.0399,0.01995,1.0,25.0625,0.019154,inf


In [53]:
### 6.5. Entertainment

trans = []
for i in range(0, 386):
    trans.append([str(entertainment_keywords_df.values[i,j]) for j in range(0, 10)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

print(trans)

te = TransactionEncoder()
data_encoded = te.fit_transform(trans)
data_encoded = pd.DataFrame(data_encoded, columns = te.columns_)

data_encoded = data_encoded.loc[:, entertainment_head.index]

### Apriori Algo (Entertainment)
frequent_itemsets=apriori(data_encoded, min_support = 0.01, use_colnames = True)

frequent_itemsets[['support']].describe()

rules_c = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules_c.sort_values('confidence', ascending = False, inplace = True)

rules_c[ (rules_c['lift'] >= 5) &
       (rules_c['confidence'] >= 0.5) ]

(386, 10)
[['ocean' 'sequel' 'twelve' ... 'box' 'roberts' 'office']
 ['wars' 'lucas' 'rating' ... 'children' 'chronicles' 'transformation']
 ['scholl' 'film' 'carmen' ... 'hitler' 'resistance' 'distributing']
 ...
 ['lopez' 'cancelled' 'film' ... 'charity' 'london' 'actress']
 ['tsunami' 'tv' 'clooney' ... 'viewers' 'funds' 're']
 ['glasgow' 'rem' 'concert' ... 'park' 'bought' 'fans']]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
90,"(oscar, office)",(box),0.010363,0.049223,0.010363,1.0,20.315789,0.009853,inf
34,(berlin),(festival),0.020725,0.069948,0.020725,1.0,14.296296,0.019276,inf
88,"(awards, academy)",(oscar),0.010363,0.062176,0.010363,1.0,16.083333,0.009718,inf
78,"(film, ray)",(foxx),0.015544,0.023316,0.015544,1.0,42.888889,0.015182,inf
74,"(office, film)",(box),0.012953,0.049223,0.012953,1.0,20.315789,0.012316,inf
69,"(berlin, film)",(festival),0.015544,0.069948,0.015544,1.0,14.296296,0.014457,inf
98,"(singles, charts)",(chart),0.010363,0.046632,0.010363,1.0,21.444444,0.009879,inf
100,"(charts, chart)",(singles),0.010363,0.031088,0.010363,1.0,32.166667,0.010041,inf
103,"(elvis, singles)",(chart),0.010363,0.046632,0.010363,1.0,21.444444,0.009879,inf
45,(office),(box),0.033679,0.049223,0.031088,0.923077,18.753036,0.02943,12.360104


## 7. Classification Model

### 7.1 Logistic Regression with Word Count

In [54]:
X = word_counts.toarray()
y = dat['category'].map( {'sport': 0, 'business': 1, 'politics': 2, 'tech': 3, 'entertainment': 4} ).astype(int)

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(solver='lbfgs',max_iter=3000), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.97


In [57]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(solver='lbfgs',max_iter=3000), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.97
Best parameters:  {'C': 0.1}


In [58]:
print("Test score: {:.2f}".format(grid.score(X_test, y_test)))

Test score: 0.98


### 7.2 Logistic Regression with TF-IDF

In [59]:
X2 = word_tfidf.toarray()
y = dat['category'].map( {'sport': 0, 'business': 1, 'politics': 2, 'tech': 3, 'entertainment': 4} ).astype(int)

In [60]:
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=2)

In [61]:
scores = cross_val_score(LogisticRegression(solver='lbfgs',max_iter=3000), X2_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.97


In [62]:
param_grid = {'C':[10, 50, 100, 150, 200]}
grid = GridSearchCV(LogisticRegression(solver='lbfgs',max_iter=3000), param_grid, cv=5)
grid.fit(X2_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.98
Best parameters:  {'C': 50}


In [63]:
print("Test score: {:.2f}".format(grid.score(X2_test, y_test)))

Test score: 0.98


### 7.3 Random Forest Classifier with Word Count

In [64]:
from sklearn.ensemble import RandomForestClassifier

X3 = word_counts.toarray()
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.3, random_state=2)

In [65]:
rf = RandomForestClassifier()
scores = cross_val_score(rf, X3_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.96


In [66]:
param_grid = {'n_estimators':[100, 200, 300],
             'max_depth':[10, 20, 30, None]}
grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid.fit(X3_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)


Best cross-validation score: 0.96
Best parameters:  {'max_depth': 30, 'n_estimators': 100}


In [67]:
print("Test score: {:.2f}".format(grid.score(X3_test, y_test)))

Test score: 0.96


### 7.4 Random Forest Classifier with TF-IDF

In [69]:
from sklearn.ensemble import RandomForestClassifier

X4 = word_tfidf.toarray()
X4_train, X4_test, y_train, y_test = train_test_split(X4, y, test_size=0.3, random_state=2)

In [70]:
rf = RandomForestClassifier()
scores = cross_val_score(rf, X4_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.96


In [71]:
param_grid = {'n_estimators':[10,50,100],
             'max_depth':[10,20,30,None]}
grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid.fit(X4_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)


Best cross-validation score: 0.96
Best parameters:  {'max_depth': None, 'n_estimators': 50}


In [72]:
print("Test score: {:.2f}".format(grid.score(X4_test, y_test)))

Test score: 0.96


## 8. Further Improvements

In [82]:
def clean_text(text):
    
    text=text.lower()
    
    text=re.sub("</?.*?>"," <> ",text)
    
    text=re.sub("(\\d|\\W|_)+"," ",text)
    
    tokens = re.split("\W+", text)
    
    text = [word for word in tokens if word not in stopwords]
    
    text = [wn.lemmatize(word) for word in text ]
    
    return text

count_vect = CountVectorizer(analyzer=clean_text)
word_counts2 = count_vect.fit_transform(cleaned_text.tolist())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24532,24533,24534,24535,24536,24537,24538,24539,24540,24541
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 8.1 Logistic Regression with Word Count (lemmatized)

In [75]:
X = word_counts2.toarray()
y = dat['category'].map( {'sport': 0, 'business': 1, 'politics': 2, 'tech': 3, 'entertainment': 4} ).astype(int)

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [77]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(solver='lbfgs',max_iter=3000), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.97


In [78]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(solver='lbfgs',max_iter=3000), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.97
Best parameters:  {'C': 0.01}


In [79]:
print("Test score: {:.2f}".format(grid.score(X_test, y_test)))

Test score: 0.98


### 8.2 Logistic Regression with TF-IDF (lemmatized)

In [89]:
from sklearn.ensemble import RandomForestClassifier

X3 = word_counts2.toarray()
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.3, random_state=2)

In [84]:
rf = RandomForestClassifier()
scores = cross_val_score(rf, X3_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.96


In [85]:
param_grid = {'n_estimators':[10,50,100],
             'max_depth':[10,20,30,None]}
grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid.fit(X3_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)


Best cross-validation score: 0.96
Best parameters:  {'max_depth': 30, 'n_estimators': 100}


In [86]:
print("Test score: {:.2f}".format(grid.score(X3_test, y_test)))

Test score: 0.97
