# Basic Text Mining and Document Classification
(by Tevfik Aytekin)

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
# You need to call nltk.download() to download all the nltk corpora

In [None]:
nltk.download('gutenberg')

gutenberg.fileids()

In [None]:
print(gutenberg.raw('austen-emma.txt'))

In [None]:
print("num_chars: ", len(gutenberg.raw('austen-emma.txt')))
print("num_words: ", len(gutenberg.words('austen-emma.txt')))
print("num_sents: ", len(gutenberg.sents('austen-emma.txt')))

### Frequency Distribution

In [None]:
words = gutenberg.words('austen-emma.txt')
dist = FreqDist(words)

In [None]:
dist

In [None]:
freqwords = [w for w in dist.keys() if len(w) > 5 and dist[w] > 50]
freqwords[:5]

### Stemming

The stemming process is defined in the [Porter stemmer web page](https://tartarus.org/martin/PorterStemmer/) as follows:

The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process that is usually done when setting up Information Retrieval systems.

In [9]:
text1 = "consulting consultant consultants consultings"
words1 = text1.lower().split(' ')
words1

['consulting', 'consultant', 'consultants', 'consultings']

In [10]:
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

['consult', 'consult', 'consult', 'consult']

In [11]:
text2 = "universal University universe"
words2 = text2.lower().split(' ')
[porter.stem(t) for t in words2]

['univers', 'univers', 'univers']

Note that the stemming process not always returns a whole meaningful word. This is because the aim of stemming is to increase the effectiveness of an information retrieval system and the success of a stemmer should be evaluated with respect to the effectiveness of the IR systems. If you want to get meaningful words then you should use lemmatization.

### Lemmatization

In [12]:
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Error loading omw-1.4: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [13]:
wn_lemma = nltk.WordNetLemmatizer()
text = "plays caring bats"
words = text.lower().split(' ')
[wn_lemma.lemmatize(t) for t in words]

['play', 'caring', 'bat']

In [14]:
text2 = "plays caring bats"
words2 = text2.lower().split(' ')
[porter.stem(t) for t in words2]

['play', 'care', 'bat']

In [15]:
wn_lemma.lemmatize('caring','v')

'care'

### Part-of-speech (POS) Tagging

In [16]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/tevfik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tevfik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
text = "Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence"
print(text)
words = nltk.word_tokenize(text)
nltk.pos_tag(words)

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence


[('Emma', 'NNP'),
 ('Woodhouse', 'NNP'),
 (',', ','),
 ('handsome', 'NN'),
 (',', ','),
 ('clever', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('rich', 'JJ'),
 (',', ','),
 ('with', 'IN'),
 ('a', 'DT'),
 ('comfortable', 'JJ'),
 ('home', 'NN'),
 ('and', 'CC'),
 ('happy', 'JJ'),
 ('disposition', 'NN'),
 (',', ','),
 ('seemed', 'VBD'),
 ('to', 'TO'),
 ('unite', 'VB'),
 ('some', 'DT'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('blessings', 'NNS'),
 ('of', 'IN'),
 ('existence', 'NN')]

## Document Classification

In [None]:
nltk.download('stopwords')

In [18]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
stop_words = stopwords.words("english")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [19]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("../datasets/bbc-text.csv")
# you can download the dataset from here: https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv

In [20]:
data.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


In [21]:
data.shape

(2225, 2)

In [22]:
import textwrap
print(textwrap.fill(data.iloc[0,1], 50))

tv future in the hands of viewers with home
theatre systems  plasma high-definition tvs  and
digital video recorders moving into the living
room  the way people watch tv will be radically
different in five years  time.  that is according
to an expert panel which gathered at the annual
consumer electronics show in las vegas to discuss
how these new technologies will impact one of our
favourite pastimes. with the us leading the trend
programmes and other content will be delivered to
viewers via home networks  through cable
satellite  telecoms companies  and broadband
service providers to front rooms and portable
devices.  one of the most talked-about
technologies of ces has been digital and personal
video recorders (dvr and pvr). these set-top boxes
like the us s tivo and the uk s sky+ system  allow
people to record  store  play  pause and forward
wind tv programmes when they want.  essentially
the technology allows for much more personalised
tv. they are also being built-in to high-
def

In [23]:
data['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

### Vector space model

#### CountVectorizer


In [None]:
import numpy as np

In [30]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 10, max_df=0.1)
count_model = vectorizer.fit(data["text"])
X = count_model.transform(data["text"])

In [31]:
pd.DataFrame(data=X.todense(), columns=count_model.get_feature_names_out())

Unnamed: 0,blair,brown,economy,election,mobile,net,sales,services,tax,tv
0,0,0,0,0,1,0,0,1,0,13
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2220,0,0,0,0,0,0,15,0,0,0
2221,0,0,0,0,0,0,0,0,0,0
2222,0,0,0,0,0,0,0,0,0,0
2223,3,3,0,0,0,0,0,0,0,0


#### TfidfVectorizer
[more info](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)

In [32]:
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
tfidf_model = vectorizer.fit(data["text"])
X = tfidf_model.transform(data["text"])

In [33]:
pd.DataFrame(data=X.todense(), columns=tfidf_model.get_feature_names_out())

Unnamed: 0,against,best,do,government,only,she,them,uk,what,you
0,0.000000,0.000000,0.109351,0.0,0.104112,0.0,0.543219,0.342281,0.742449,0.117168
1,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.000000
2,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.972785,0.000000,0.000000,0.0,0.231711,0.0,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
2220,0.000000,0.000000,0.000000,0.0,0.700507,0.0,0.000000,0.000000,0.713645,0.000000
2221,0.000000,0.000000,0.000000,0.0,0.876978,0.0,0.000000,0.480531,0.000000,0.000000
2222,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.000000
2223,0.000000,0.171421,0.072521,0.0,0.207139,0.0,0.432312,0.000000,0.070341,0.854756


In [None]:
idf = 2225/100
tf = 5
np.log(idf)
tfidf = tf * idf
tfidf

### Full model

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

vectorizer = TfidfVectorizer(analyzer = "word", max_features = 1000)
tfidf_model = vectorizer.fit(data["text"])
pickle.dump(tfidf_model, open("../datasets/tfidf.pkl", "wb"))
X = tfidf_model.transform(data["text"])
X_train,X_test,y_train,y_test = train_test_split(X,data["category"],test_size = 0.1)
#clf = OneVsRestClassifier(LogisticRegression())
#clf = GradientBoostingClassifier()
clf = RandomForestClassifier()
#clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)
pickle.dump(clf, open("../datasets/text_clf.pkl", 'wb'))
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

               precision    recall  f1-score   support

     business       0.95      0.95      0.95        58
entertainment       0.97      0.86      0.91        37
     politics       0.91      0.96      0.93        45
        sport       0.88      0.98      0.93        47
         tech       1.00      0.92      0.96        36

     accuracy                           0.94       223
    macro avg       0.94      0.93      0.94       223
 weighted avg       0.94      0.94      0.94       223

[[55  0  2  1  0]
 [ 1 32  2  2  0]
 [ 0  0 43  2  0]
 [ 1  0  0 46  0]
 [ 1  1  0  1 33]]


In [36]:
tf_model = pickle.load(open("../datasets/tfidf.pkl", 'rb'))

new_data = ["The Betis substitute broke clear and coolly slide \
            the ball past goalkeeper Thibaut Courtois as the hosts \
            picked up only their eighth victory of the season to move \
            up to 12th, eight points clear of the relegation zone."]
X = tfidf_model.transform(new_data)
loaded_clf = pickle.load(open("../datasets/text_clf.pkl", 'rb'))
preds = loaded_clf.predict_proba(X)
print(loaded_clf.classes_)
print(preds)

['business' 'entertainment' 'politics' 'sport' 'tech']
[[0.13 0.18 0.06 0.58 0.05]]
