In [2]:
# Load the training split of the 20 Newsgroups dataset from scikit-learn.
from sklearn.datasets import fetch_20newsgroups

# `remove` is documented as a tuple; recent scikit-learn releases validate
# argument types, so a tuple is passed here instead of a list. It asks the
# loader to strip headers, footers and quoted replies from every post.
twenty_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# List of category names (the class labels) in the training split.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the full vocabulary: counts the occurrences of each word.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Same counting, but with English stop words ('the', 'of', ...) excluded.
count_vect2 = CountVectorizer(stop_words='english')
X_train_counts2 = count_vect2.fit_transform(twenty_train.data)

# Shapes of the two document-term matrices, one per line.
print(X_train_counts.shape, X_train_counts2.shape, sep="\n")

# count_vect.vocabulary_.get(u'algorithm')

(11314, 101631)
(11314, 101322)


In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Convert raw occurrence counts to tf-idf weights.
# tf-idf: 1. Divide the number of occurrences of each word in a document by the
#            total number of words in that document (term frequency, tf).
#         2. Downscale the weights of words that occur in many documents
#            (inverse document frequency, idf).
tfidf_transformer = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()

# X_train_tfidf / X_train_tfidf2 are the feature matrices used to train a model.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

# Load the test split with the same `remove` setting as the training split so
# that test documents are stripped of headers, footers and quotes exactly like
# the documents the vectorizers were fitted on.
X_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
# docs_test holds the raw test documents to predict on (never to fit on).
docs_test = X_test.data

print(X_train_tfidf.shape)
print(X_train_tfidf2.shape)

(11314, 101631)
(11314, 101322)


In [0]:
import pandas as pd


def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer, in column order.

    Uses ``get_feature_names_out`` when the vectorizer provides it and falls
    back to the older ``get_feature_names`` otherwise, so the cell runs on
    both recent and older scikit-learn releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Pick one training document (index 13) and take its tf-idf row from each matrix.
first_vector = X_train_tfidf[13]
first_vector2 = X_train_tfidf2[13]

# Show the TF-IDF scores of that document, one row per vocabulary term, to
# compare the weighting with stop words kept (df) and removed (df2).
# toarray() yields a plain ndarray column instead of the np.matrix from todense().
df = pd.DataFrame(first_vector.T.toarray(), index=_feature_names(count_vect), columns=["tfidf"])
df2 = pd.DataFrame(first_vector2.T.toarray(), index=_feature_names(count_vect2), columns=["tfidf_stopwords"])



In [7]:
# Terms of the selected document ranked by tf-idf weight, highest first.
df.sort_values("tfidf", ascending=False)

Unnamed: 0,tfidf
ssf,0.356347
flights,0.298625
option,0.242772
capability,0.242602
the,0.216754
...,...
discern,0.000000
discarded,0.000000
discard,0.000000
discarcina,0.000000


In [8]:
# Same ranking for the variant built with English stop words removed.
df2.sort_values("tfidf_stopwords", ascending=False)

Unnamed: 0,tfidf_stopwords
ssf,0.383239
flights,0.321161
option,0.261093
capability,0.260910
module,0.224627
...,...
disappoint,0.000000
disappering,0.000000
disappears,0.000000
disappearing,0.000000


# Models

## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Raw text -> token counts -> tf-idf weights -> logistic regression.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the default solver the estimator
# already fits a multinomial model when there are more than two classes.
text_clf_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_lr', LogisticRegression()),
])
text_clf_lr.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,


## SVM


In [13]:
from sklearn.linear_model import SGDClassifier

# Raw text -> token counts -> tf-idf weights -> linear classifier trained with
# stochastic gradient descent (the default hinge loss makes it a linear SVM).
# A fixed random_state makes the fitted model, and therefore the reported
# scores, reproducible from one run to the next.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_svm', SGDClassifier(random_state=42)),
])
text_clf_svm.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Raw text -> token counts -> tf-idf weights -> random forest.
# A fixed random_state makes the randomized tree construction, and therefore
# the reported scores, reproducible from one run to the next.
text_clf_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_rf', RandomForestClassifier(random_state=42)),
])
text_clf_rf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

## Evaluate

In [16]:
import numpy as np
from sklearn import metrics


def evaluate_classifier(label, classifier, docs, targets, target_names):
    """Print accuracy and a per-class report for one fitted text classifier.

    Parameters
    ----------
    label : str
        Heading printed before the scores.
    classifier : fitted estimator or pipeline exposing ``predict``
        Called on ``docs`` to obtain the predicted labels.
    docs : sequence
        Test documents passed to ``classifier.predict``.
    targets : array-like
        True labels, compared element-wise with the predictions.
    target_names : list of str
        Class names shown in the classification report.

    Returns
    -------
    The array of predictions returned by ``classifier.predict(docs)``.
    """
    print(label)
    predictions = classifier.predict(docs)
    # Accuracy: fraction of test documents whose predicted label is correct.
    print(np.mean(predictions == targets))
    print(metrics.classification_report(targets, predictions, target_names=target_names))
    return predictions


# Test split, stripped of headers, footers and quotes. `remove` is documented
# as a tuple, so a tuple is passed instead of a list.
twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
docs_test = twenty_test.data

predicted_lr = evaluate_classifier(
    "Logistic Regresssion:", text_clf_lr, docs_test, twenty_test.target, twenty_test.target_names
)
predicted_svm = evaluate_classifier(
    "SVM:", text_clf_svm, docs_test, twenty_test.target, twenty_test.target_names
)
predicted_rf = evaluate_classifier(
    "Random Forest:", text_clf_rf, docs_test, twenty_test.target, twenty_test.target_names
)

Logistic Regresssion:
0.6736590546999469
                          precision    recall  f1-score   support

             alt.atheism       0.47      0.45      0.46       319
           comp.graphics       0.62      0.70      0.66       389
 comp.os.ms-windows.misc       0.66      0.61      0.63       394
comp.sys.ibm.pc.hardware       0.66      0.62      0.64       392
   comp.sys.mac.hardware       0.74      0.66      0.70       385
          comp.windows.x       0.82      0.68      0.74       395
            misc.forsale       0.70      0.80      0.75       390
               rec.autos       0.71      0.70      0.70       396
         rec.motorcycles       0.71      0.78      0.74       398
      rec.sport.baseball       0.50      0.83      0.63       397
        rec.sport.hockey       0.89      0.85      0.87       399
               sci.crypt       0.85      0.65      0.74       396
         sci.electronics       0.55      0.62      0.58       393
                 sci.med       0.7

In [0]:
 #   Training a classifier example:
 #   from sklearn.naive_bayes import MultinomialNB
 #   clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    


In [0]:
#  Predict outcomes for new documents:
#  Doc_examples = ['God is love', 'GPU is fast']

#  Convert the documents to tf-idf with the already-fitted vectorizer and transformer
#  (call transform, not fit_transform).
#  X_new_counts = count_vect.transform(Doc_examples) 
#  X_new_tfidf = tfidf_transformer.transform(X_new_counts)


#  predicted = clf.predict(X_new_tfidf)   # clf is the name of your fitted model object
#  `predicted` stores one prediction per document in Doc_examples.


