In [139]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import string
from wordcloud import WordCloud
from spacy.lang.en import STOP_WORDS
import glob

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report  ,confusion_matrix , accuracy_score
from sklearn.pipeline import Pipeline


In [102]:
nlp = spacy.load('en_core_web_lg')
#stop words

stopwords = list(STOP_WORDS)
punct = string.punctuation

## Read the files

In [119]:


#first name the categories
category = ['business' , 'entertainment' , 'politics' , 'sport' , 'tech']

In [120]:
data = []

#read the files and save it to the list
for i,filename in enumerate(category):
    path = f'bbc/{filename}/*.txt'
    files = glob.glob(path)
    for j , file in enumerate(files):
        with open(file  ,'r') as f:
            data.append((f.read() , filename))

In [121]:
#save the text to dataframe

df = pd.DataFrame(data , columns= ['Text' , 'Category'])

In [122]:
df.head()

Unnamed: 0,Text,Category
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [123]:
df['Category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Category, dtype: int64

# train test split

In [124]:
X_train , X_test  ,y_train  ,y_test = train_test_split(df['Text'] , df['Category']  ,test_size=0.3,shuffle=True , random_state=42 )

In [125]:
X_train.head() , X_train.shape

(1617    Kirwan demands Italy consistency\n\nItaly coac...
 1706    Tindall wants second opinion\n\nEngland centre...
 1639    Sella wants Michalak recall\n\nFormer France c...
 1221    Prime minister's questions\n\nSo who, if anyon...
 680     US 'to raise TV indecency fines'\n\nUS politic...
 Name: Text, dtype: object, (1557,))

In [126]:
X_test.head() , X_test.shape

(414     UK house prices dip in November\n\nUK house pr...
 420     LSE 'sets date for takeover deal'\n\nThe Londo...
 1644    Harinordoquy suffers France axe\n\nNumber eigh...
 416     Barclays shares up on merger talk\n\nShares in...
 1232    Campaign 'cold calls' questioned\n\nLabour and...
 Name: Text, dtype: object, (668,))

# Now change into tokens

In [127]:
def change_into_tokens(text):
    doc = nlp(text)
    tokens = []
    cleaned_text = []
    
    for token in doc:
        if token.lemma_ != '-PRON-':
            tokens.append(token.lemma_.lower().strip())
        else:
            tokens.append(token.lower_)
            
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_text.append(token)
    
    return cleaned_text

In [132]:
#check 

change_into_tokens('HEY THIS IS REAL')

['hey', 'real']

# now create the model

In [133]:
linear_model  =LogisticRegression()
tfidf = TfidfVectorizer(tokenizer=change_into_tokens)

In [134]:
clf = Pipeline([('tfidf' , tfidf) , ('linear_model'  , linear_model)])

In [135]:
clf.fit(X_train  ,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                                 tokenizer=<function change_into_tokens at 0x00000205322BFDC8>,
                                 use_idf=True, vocabulary=None)),
                ('linear_model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 

In [136]:
y_pred = clf.predict(X_test)

In [137]:
print(classification_report(y_test , y_pred))

               precision    recall  f1-score   support

     business       0.96      0.97      0.96       165
entertainment       0.99      0.97      0.98       118
     politics       0.97      0.97      0.97       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.96      0.97       125

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668



In [138]:
print(confusion_matrix(y_test , y_pred))

[[160   0   3   1   1]
 [  1 115   1   0   1]
 [  2   0 116   1   1]
 [  1   0   0 139   0]
 [  3   1   0   1 120]]


In [140]:
print(accuracy_score(y_test , y_pred))

0.9730538922155688


In [143]:

#wow 97 percent accurate