# Basic Text Mining and Document Classification
(by Tevfik Aytekin)

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
# Run nltk.download() once beforehand to fetch the NLTK corpora and models used in this notebook

In [None]:
# List the identifiers of the texts available in the Gutenberg corpus.
gutenberg.fileids()

In [None]:
# Show the raw, untokenized contents of Jane Austen's "Emma".
gutenberg.raw('austen-emma.txt')

In [None]:
# Report the size of "Emma" at three granularities: characters, word tokens
# and sentences. Each corpus reader is paired with the label it is printed under.
emma_id = 'austen-emma.txt'
size_readers = (
    ("num_chars: ", gutenberg.raw),
    ("num_words: ", gutenberg.words),
    ("num_sents: ", gutenberg.sents),
)
for label, reader in size_readers:
    print(label, len(reader(emma_id)))

### Frequency Distribution

In [None]:
# Tokenized view of "Emma" (one entry per word/punctuation token).
words = gutenberg.words('austen-emma.txt')
# Count how many times each distinct token occurs.
dist = FreqDist(words)

In [None]:
# Display the frequency distribution built in the previous cell.
dist

In [None]:
# Keep only "long and frequent" tokens: more than 5 characters and more than
# 50 occurrences. Iterating over items() yields each token with its count.
freqwords = [
    token
    for token, count in dist.items()
    if len(token) > 5 and count > 50
]
# Peek at the first five matches.
freqwords[:5]

### Stemming

The stemming process is defined in the [Porter stemmer web page](https://tartarus.org/martin/PorterStemmer/) as follows:

The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process that is usually done when setting up Information Retrieval systems.

In [None]:
# Four surface forms that share the stem "consult".
text1 = "consulting consultant consultants consultings"
# Lower-case the text and split it on single spaces to get a list of words.
words1 = text1.lower().split(' ')
words1

In [None]:
# Build a Porter stemmer and apply it to every word of the first example.
porter = nltk.PorterStemmer()
list(map(porter.stem, words1))

In [None]:
# Second example: words with different meanings but similar spelling.
text2 = "universal University universe"
words2 = text2.lower().split(' ')
# Stem each word with the Porter stemmer created in the previous cell.
[porter.stem(t) for t in words2]

Note that stemming does not always return a meaningful whole word. This is because the aim of stemming is to increase the effectiveness of an information retrieval (IR) system, so the success of a stemmer should be evaluated with respect to the effectiveness of the IR system that uses it. If you want to get meaningful words, you should use lemmatization instead.

### Lemmatization

In [None]:
# WordNet-based lemmatizer (needs the WordNet data from nltk.download()).
wn_lemma = nltk.WordNetLemmatizer()
text = "plays caring bats"
# NOTE: this rebinds `words`, which earlier held the Gutenberg tokens.
words = text.lower().split(' ')
# No part-of-speech is passed here; the next cell passes 'v' explicitly for
# "caring" (presumably the default treats words as nouns — confirm in NLTK docs).
[wn_lemma.lemmatize(t) for t in words]

In [None]:
# Lemmatize "caring" again, this time telling the lemmatizer it is a verb ('v').
wn_lemma.lemmatize('caring','v')

In [None]:
# Lemmatize the "universal / university / universe" example for comparison
# with the Porter stemmer output above.
[wn_lemma.lemmatize(t) for t in words2]

### Part-of-speech (POS) Tagging

In [None]:
# Sample sentence to tag.
text = "Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence"
print(text)
# Split the sentence into word and punctuation tokens.
words = nltk.word_tokenize(text)
# Attach a part-of-speech tag to every token; the cell displays (token, tag) pairs.
nltk.pos_tag(words)

## Document Classification

In [None]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# English stop-word list from NLTK (needs the stopwords data from nltk.download()).
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
stop_words = stopwords.words("english")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the BBC news dataset from a local copy.
# You can download the dataset from here:
# https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv
data = pd.read_csv("../datasets/bbc-text.csv")

In [None]:
# Preview the first ten rows of the dataset.
data.head(10)

In [None]:
# Value in the first row, second column
# (presumably the article text — confirm the column order in the CSV).
data.iloc[0,1]

In [None]:
# Number of documents per category (shows how balanced the classes are).
data['category'].value_counts()

### Vector space model

#### CountVectorizer


In [None]:
# Bag-of-words model over word tokens.
#   max_features=20 : keep only the 20 most frequent terms in the corpus
#   max_df=0.2      : ignore terms that occur in more than 20% of the documents
vectorizer = CountVectorizer(analyzer = "word", max_features = 20, max_df=0.2)
# Learn the vocabulary from the article texts.
count_model = vectorizer.fit(data["text"])
# Encode every article as a sparse vector of term counts.
X = count_model.transform(data["text"])

In [None]:
# The terms selected as features; one per column of X.
count_model.get_feature_names_out()

In [None]:
# Dense view of the term counts for the first five documents.
X.todense()[:5]

In [None]:
# (number of documents, number of selected terms)
X.shape

#### TfidfVectorizer


In [None]:
# TF-IDF model over word tokens.
#   max_features=10 : keep only the 10 most frequent terms in the corpus
#   max_df=0.3      : ignore terms that occur in more than 30% of the documents
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 10, max_df=0.3)
# Learn the vocabulary and IDF weights from the article texts.
tfidf_model = vectorizer.fit(data["text"])
# Encode every article as a sparse vector of TF-IDF weights (rebinds X).
X = tfidf_model.transform(data["text"])

In [None]:
# The terms selected as TF-IDF features; one per column of X.
tfidf_model.get_feature_names_out()

In [None]:
# NOTE(review): `np` is not used in this cell.
import numpy as np
# Dense view of the TF-IDF weights for the first five documents.
X.todense()[:5]

### Full model

In [136]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Split the raw documents BEFORE fitting the vectorizer, so that the TF-IDF
# vocabulary and IDF weights are learned from the training portion only.
# Fitting on the full dataset would leak test-set information into the features
# and make the reported scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    data["text"], data["category"], test_size=0.1
)

# Learn a 1000-term TF-IDF representation from the training documents and
# persist it so new documents can be encoded identically later.
vectorizer = TfidfVectorizer(analyzer="word", max_features=1000)
tfidf_model = vectorizer.fit(text_train)
with open("../datasets/tfidf.pkl", "wb") as f:
    pickle.dump(tfidf_model, f)

X_train = tfidf_model.transform(text_train)
X_test = tfidf_model.transform(text_test)

# Alternative classifiers to experiment with (uncomment exactly one):
#clf = OneVsRestClassifier(LogisticRegression())
#clf = GradientBoostingClassifier()
clf = DecisionTreeClassifier()

# Train, persist the fitted classifier, and evaluate on the held-out documents.
clf.fit(X_train, y_train)
with open("../datasets/text_clf.pkl", "wb") as f:
    pickle.dump(clf, f)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

               precision    recall  f1-score   support

     business       0.83      0.80      0.81        49
entertainment       0.82      0.84      0.83        32
     politics       0.84      0.73      0.78        44
        sport       0.93      0.96      0.95        55
         tech       0.79      0.88      0.84        43

     accuracy                           0.85       223
    macro avg       0.84      0.84      0.84       223
 weighted avg       0.85      0.85      0.85       223

[[39  2  2  1  5]
 [ 2 27  0  2  1]
 [ 4  3 32  1  4]
 [ 0  0  2 53  0]
 [ 2  1  2  0 38]]


In [138]:
# Reload the persisted vectorizer and classifier and score an unseen document,
# the way the saved artifacts would be used in a fresh session.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary
# code when loading untrusted data.
with open("../datasets/tfidf.pkl", "rb") as f:
    tf_model = pickle.load(f)

new_data = ["The Betis substitute broke clear and coolly slide \
            the ball past goalkeeper Thibaut Courtois as the hosts \
            picked up only their eighth victory of the season to move \
            up to 12th, eight points clear of the relegation zone."]
# Encode with the *loaded* vectorizer (not the in-memory `tfidf_model`) so this
# cell really exercises the pickled artifact.
X = tf_model.transform(new_data)
with open("../datasets/text_clf.pkl", "rb") as f:
    loaded_clf = pickle.load(f)
# Class-membership probabilities; columns follow the order of `classes_`.
preds = loaded_clf.predict_proba(X)
print(loaded_clf.classes_)
print(preds)

['business' 'entertainment' 'politics' 'sport' 'tech']
['sport']


In [135]:
# Dense view of whatever X currently holds. In file order that is the TF-IDF
# vector of `new_data`; the execution counter suggests this cell may have been
# run at a different point, so confirm before relying on the shown output.
X.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.07247404, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0