In [0]:
#import 20newsgroups datasets from sklearn
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',remove=(['headers','footers', 'quotes']))

In [2]:
#list of categories(labels)
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

#counts the occurence of each word. 
count_vect = CountVectorizer()

#second one exclude stopwords, like 'the','of'..
count_vect2= CountVectorizer(stop_words='english') 

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts2 = count_vect2.fit_transform(twenty_train.data)

print(X_train_counts.shape)
print(X_train_counts2.shape)

# count_vect.vocabulary_.get(u'algorithm')

(11314, 101631)
(11314, 101322)


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
#transform from occurrences to tf-idf 
#tf-idf: 1. Decide the number of occurrences of each word in a document by total number of words in document. (Term Frequences tf)
#        2. It does another refinement. Downscaling weights for words that occurring in many documents.
tfidf_transformer = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()

#use X_train_tfidf to train the MODEL
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

X_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = X_test.data
#use docs_test to fit.
print(X_train_tfidf.shape)
print(X_train_tfidf2.shape)

(11314, 101631)
(11314, 101322)


In [0]:
import pandas as pd

# get the first document
first_vector=X_train_tfidf[13]
first_vector2=X_train_tfidf2[13]
 
# show the TF-IDF scores , compare with/without stopwords
df = pd.DataFrame(first_vector.T.todense(), index=count_vect.get_feature_names(), columns=["tfidf"])
df2 = pd.DataFrame(first_vector2.T.todense(), index=count_vect2.get_feature_names(), columns=["tfidf_stopwords"])



In [6]:
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
ssf,0.356347
flights,0.298625
option,0.242772
capability,0.242602
the,0.216754
...,...
discern,0.000000
discarded,0.000000
discard,0.000000
discarcina,0.000000


In [7]:
df2.sort_values(by=["tfidf_stopwords"],ascending=False)

Unnamed: 0,tfidf_stopwords
ssf,0.383239
flights,0.321161
option,0.261093
capability,0.260910
module,0.224627
...,...
disappoint,0.000000
disappering,0.000000
disappears,0.000000
disappearing,0.000000


# Models

## Logistic Regression

In [8]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_lr', LogisticRegression(multi_class = 'multinomial')),])
text_clf_lr.fit(twenty_train.data, twenty_train.target)
# clf_lr = LogisticRegression().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,


### Hyperparameter Tuning

In [9]:
# Hyperparameter tuning using Randomized search
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = twenty_train.data
Y = twenty_train.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)

# Create lists of parameter for Logistic Regression Classifier:
penalty = ['l1', 'l2', 'elasticnet']
C = [0.001, 0.01, 0.1, 1, 10, 100]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
solver = ['sag', 'saga']


n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_lr__penalty = penalty, clf_lr__C = C, clf_lr__class_weight = class_weight, clf_lr__solver = solver);
# Call RandomizedSearchCV
rs_clf_lr = RandomizedSearchCV(text_clf_lr, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_lr.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_lr.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_lr.best_estimator_.get_params()['tfidf__use_idf'])
print('Best penalty:', rs_clf_lr.best_estimator_.get_params()['clf_lr__penalty'])
print('Best C:', rs_clf_lr.best_estimator_.get_params()['clf_lr__C'])
print('Best class_weight:', rs_clf_lr.best_estimator_.get_params()['clf_lr__class_weight'])
print('Best solver:', rs_clf_lr.best_estimator_.get_params()['clf_lr__solver'])
print(); print(rs_clf_lr.best_estimator_.get_params()['clf_lr'])



Best n-gram range: (1, 1)
Best use_idf: True
Best penalty: l2
Best C: 1
Best class_weight: None
Best solver: sag

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)


## SVM


In [15]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_svm', SGDClassifier()),])
text_clf_svm.fit(twenty_train.data, twenty_train.target)
# clf_svm = SGDClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

### Hyperparameter Tuning

In [26]:
# Create lists of parameter for SVM Classifier:
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1]
learning_rate = ['optimal', 'constant', 'adaptive']
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
eta0 = [0.01, 0.1, 0.5, 1, 100] 

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_svm__alpha = alpha, clf_svm__learning_rate = learning_rate, clf_svm__class_weight = class_weight, clf_svm__eta0 = eta0);
# Call RandomizedSearchCV
rs_clf_svm = RandomizedSearchCV(text_clf_svm, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_svm.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_svm.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_svm.best_estimator_.get_params()['tfidf__use_idf'])
print('Best alpha:', rs_clf_svm.best_estimator_.get_params()['clf_svm__alpha'])
print('Best learning_rate:', rs_clf_svm.best_estimator_.get_params()['clf_svm__learning_rate'])
print('Best class_weight:', rs_clf_svm.best_estimator_.get_params()['clf_svm__class_weight'])
print('Best eta0:', rs_clf_svm.best_estimator_.get_params()['clf_svm__eta0'])
print(); print(rs_clf_svm.best_estimator_.get_params()['clf_svm'])



Best n-gram range: (1, 2)
Best use_idf: True
Best alpha: 1e-05
Best learning_rate: constant
Best class_weight: None
Best eta0: 0.5

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.5, fit_intercept=True,
              l1_ratio=0.15, learning_rate='constant', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_rf', RandomForestClassifier()),])
text_clf_rf.fit(twenty_train.data, twenty_train.target)
# clf_rf = RandomForestClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Hyperparameter Tuning

In [21]:
# Create lists of parameter for Random Forest Classifier:
# Number of trees in random forest
n_estimators = [10, 100, 500, 750]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [45, 65, 95, 125]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range = ngram_range,
                  tfidf__use_idf = use_idf, clf_rf__n_estimators = n_estimators, clf_rf__max_features = max_features, clf_rf__max_depth=max_depth, clf_rf__min_samples_split = min_samples_split, clf_rf__min_samples_leaf = min_samples_leaf, clf_rf__bootstrap = bootstrap);
# Call RandomizedSearchCV
rs_clf_rf = RandomizedSearchCV(text_clf_rf, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf_rf.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_rf.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_rf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best n_estimators:', rs_clf_rf.best_estimator_.get_params()['clf_rf__n_estimators'])
print('Best max_features:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_features'])
print('Best max_depth:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_depth'])
print('Best min_samples_split:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_split'])
print('Best min_samples_leaf:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_leaf'])
print('Best bootstrap:', rs_clf_rf.best_estimator_.get_params()['clf_rf__bootstrap'])
print(); print(rs_clf_rf.best_estimator_.get_params()['clf_rf'])



Best n-gram range: (1, 2)
Best use_idf: False
Best n_estimators: 750
Best max_features: sqrt
Best max_depth: None
Best min_samples_split: 2
Best min_samples_leaf: 4
Best bootstrap: True

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


## Evaluate

In [27]:
twenty_test = fetch_20newsgroups(subset='test', remove=(['headers','footers', 'quotes']))
docs_test = twenty_test.data

print("Logistic Regresssion:")
predicted_lr = rs_clf_lr.predict(docs_test)
print(np.mean(predicted_lr == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_lr, target_names=twenty_test.target_names))

print("SVM:")
predicted_svm = rs_clf_svm.predict(docs_test)
print(np.mean(predicted_svm == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_svm, target_names=twenty_test.target_names))

print("Random Forest:")
predicted_rf = rs_clf_rf.predict(docs_test)
print(np.mean(predicted_rf == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_rf, target_names=twenty_test.target_names))

Logistic Regresssion:
0.6909187466808284
                          precision    recall  f1-score   support

             alt.atheism       0.49      0.45      0.47       319
           comp.graphics       0.63      0.71      0.67       389
 comp.os.ms-windows.misc       0.66      0.63      0.64       394
comp.sys.ibm.pc.hardware       0.68      0.65      0.66       392
   comp.sys.mac.hardware       0.75      0.69      0.72       385
          comp.windows.x       0.83      0.72      0.77       395
            misc.forsale       0.76      0.79      0.77       390
               rec.autos       0.75      0.71      0.73       396
         rec.motorcycles       0.48      0.81      0.61       398
      rec.sport.baseball       0.81      0.82      0.82       397
        rec.sport.hockey       0.90      0.86      0.88       399
               sci.crypt       0.89      0.67      0.76       396
         sci.electronics       0.56      0.61      0.59       393
                 sci.med       0.7

In [0]:
 #   Training a classifier example:
 #   from sklearn.naive_bayes import MultinomialNB
 #   clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    


In [0]:
#  Predict an outcome on new document:
#  Doc_examples = ['God is love', 'GPU is fast']

#  convert documents into tf-idf.  
#  X_new_counts = count_vect.transform(Doc_examples) 
#  X_new_tfidf = tfidf_transformer.transform(X_new_counts)


#  predicted = clf.predict(X_new_tfidf)   (name_of_your_model_object)
#  it stores the prediction of doc_examples.


