# Text Classification #

### 20newsgroups dataset ###
20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the shuffled training split of the 20newsgroups dataset.
train = fetch_20newsgroups(shuffle=True, subset='train')

# Last expression of the cell: displays all the category names.
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
# train.target holds one integer label per training document.
print('Train set size: %s ' % (train.target.size,))

Train set size: 11314 


In [3]:
# Translate the first document's integer label into its category name.
first_category = train.target_names[train.target[0]]
print('FIRST TEXT CATEGORY: %s \n\n' % first_category)

# Only the first ten lines of the document are shown.
print('FIRST TEXT: \n')
first_lines = train.data[0].split("\n")[:10]
print('\n'.join(first_lines))

FIRST TEXT CATEGORY: rec.autos 


FIRST TEXT: 

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 


# 1. Bag of Words  - data representation #

### Vectorization ###

In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Print floating-point array values with two decimal places.
np.set_printoptions(precision=2)

In [7]:
# Toy corpus: three tiny documents built from the words aaa, bbb and ccc.
corpus = [
    'aaa aaa aaa aaa aaa bbb',
    'bbb bbb bbb bbb bbb bbb',
    'bbb ccc',
]

# Learn the vocabulary and count word occurrences per document in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Print the column order of the count matrix, then display the matrix itself
# as a dense array.
print(vectorizer.get_feature_names_out())
X.toarray()

['aaa' 'bbb' 'ccc']


array([[5, 1, 0],
       [0, 6, 0],
       [0, 1, 1]])

### TF-IDF (TF – term frequency, IDF – inverse document frequency) ###

In [8]:
# Re-weight the toy count matrix X with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X)

# Dense view of the weighted matrix (rows are documents, columns are terms).
X_train_tfidf.toarray()

array([[0.99, 0.12, 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 0.51, 0.86]])

# 2. Fitting a model, Pipeline #

In [9]:
# Vectorization: build the document-term count matrix for the training texts.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(raw_documents=train.data)

# (number of documents, vocabulary size)
X_train_counts.shape

(11314, 130107)

In [9]:
# Converting the raw counts to TF-IDF weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

# The shape is unchanged by the re-weighting.
X_train_tfidf.shape

(11314, 130107)

In [10]:
# Using DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Direct fit on the TF-IDF matrix, kept for reference only (left disabled);
# the Pipeline cell below chains vectorization, TF-IDF and the classifier.
# dtc = DecisionTreeClassifier().fit(X_train_tfidf, train.target)


### Pipeline ###

In [13]:
# We can write less code and do all of the above by building a pipeline
# (count vectorization -> TF-IDF -> classifier).
# The step names 'vect', 'tfidf' and 'dtc' are arbitrary labels; they are the
# prefixes used to address parameters as 'stepName__paramName'.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters.

from sklearn.pipeline import Pipeline

pipe_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Fixed seed: tree construction randomly permutes the features at each
    # split, so without it the reported accuracy changes from run to run.
    ('dtc', DecisionTreeClassifier(random_state=2137)),
])

# Now we can use the original dataset train.data (raw text) directly.
pipe_clf = pipe_clf.fit(train.data, train.target)

In [14]:
# Performance of DecisionTreeClassifier on the test split
test = fetch_20newsgroups(shuffle=True, subset='test')
predicted = pipe_clf.predict(test.data)

# Fraction of test documents whose predicted label equals the true label.
(predicted == test.target).mean()

# is the result really bad?

0.5566914498141264

### Grid search ###

In [15]:
# Candidate values to check for each pipeline parameter.
# Every key has the form 'stepName__paramName'.
# E.g. 'vect__ngram_range': [(1, 1), (1, 2)] compares unigrams only against
# unigrams plus bigrams, and the search keeps whichever one is optimal.

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    # 'dtc__max_depth': (20, 40),
}

In [16]:
# THE COMMANDS BELOW ARE TIME EXPENSIVE!

# n_jobs=-1 means using all cores.
# Perhaps you may need to run "conda install -c anaconda joblib"

from sklearn.model_selection import GridSearchCV

# Run the grid search on the pipeline, trying every combination in `parameters`.
gs_clf = GridSearchCV(estimator=pipe_clf, param_grid=parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train.data, train.target)

# Report the best score found and the parameter combination that produced it.
print("Best score: %s" % gs_clf.best_score_)
print("Best param: %s" % gs_clf.best_params_)

Best score: 0.6380587961568351
Best param: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


# 3. NLTK - Natural Language Toolkit #

### Stop words ###

In [35]:
# # Removing stop words with CountVectorizer
# text_clf = Pipeline([
#     ('vect', CountVectorizer(stop_words='english')), 
#     ('tfidf', TfidfTransformer()), 
#     ('clf', DecisionTreeClassifier())
# ])

In [18]:
!pip install nltk
import nltk
nltk.download('snowball_data')
nltk.download('stopwords')

from nltk.corpus import stopwords
print(stopwords.words('english'))

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m435.6 kB/s[0m eta [36m0:00:00[0m1m427.9 kB/s[0m eta [36m0:00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m792.2 kB/s[0m eta [36m0:00:00[0m1m752.2 kB/s[0m eta [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.5/770.5 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.1 regex-2022.10.31 tqdm-4.64.1
You should consider upgradin

[nltk_data] Downloading package snowball_data to
[nltk_data]     /home/stepy/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/stepy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming ###

In [19]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer, configured with ignore_stopwords=True.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

# Two sample words and the stems produced for them.
print('running --> %s' % stemmer.stem("running"))
print('generously --> %s' % stemmer.stem("generously"))



running --> run
generously --> generous


In [20]:
# Use stemming in the vectorization process

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are stemmed before they are counted.

    Uses the module-level `stemmer` object defined in the stemming cell.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the standard analyzer first, then stem every token it yields.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Vectorizer that removes English stop words and stems the remaining tokens.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

pipe_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    # Fixed seed: tree construction randomly permutes the features at each
    # split; seeding keeps the accuracy below reproducible so that it can be
    # compared across runs.
    ('dtc', DecisionTreeClassifier(random_state=2137)),
])

pipe_stemmed = pipe_stemmed.fit(train.data, train.target)

predicted_stemmed = pipe_stemmed.predict(test.data)

# Fraction of test documents classified correctly.
print('Accuracy after stemming: %s' % np.mean(predicted_stemmed == test.target))

Accuracy after stemming: 0.5695698353690919


# 4. Improving results

In [38]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, so make sure the
# corpus is available before the lemmatizer is used.
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()

class LemmatizedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are lemmatized before they are counted.

    Uses the module-level `lemmatizer` object created in this cell.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of lemmatized tokens."""
        # super() has to resolve against this class: referring to
        # StemmedCountVectorizer here raises a TypeError, because this class
        # does not derive from it.
        analyzer = super().build_analyzer()
        # Lemmatize (rather than stem) every token produced by the standard analyzer.
        return lambda doc: [lemmatizer.lemmatize(w) for w in analyzer(doc)]

In [35]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Stemmed vectorizer and TF-IDF as before, with an SGD-trained linear model
# (hinge loss) as the final classifier. The pipeline is given ".cache" as its
# `memory` location.
best_pipeline = Pipeline(
    steps=[
        ('vect', stemmed_count_vect),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', max_iter=10, tol=None, random_state=2137)),
    ],
    memory=".cache",
)

In [36]:
# Cross-validate the whole pipeline on the training data; one score per fold.
cross_val_score(best_pipeline, X=train.data, y=train.target)

array([0.92, 0.92, 0.92, 0.92, 0.92])