Load Data

In [4]:
import os
import tarfile
from contextlib import closing

try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen

URL = ("http://people.csail.mit.edu/jrennie/"
       "20Newsgroups/20news-bydate.tar.gz")

ARCHIVE_NAME = URL.rsplit('/', 1)[1]
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"

# Download and unpack the dataset only when one of the extracted folders is
# missing, so that re-running this cell does no work once the data is in place.
if not os.path.exists(TRAIN_FOLDER) or not os.path.exists(TEST_FOLDER):

    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (14 MB)" % URL)
        # closing() releases the HTTP response when done and also works with
        # the Python 2 urlopen, which is not a context manager.
        with closing(urlopen(URL)) as opener:
            with open(ARCHIVE_NAME, 'wb') as archive:
                archive.write(opener.read())

    print("Decompressing %s" % ARCHIVE_NAME)
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it on archives from a trusted source.
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        archive.extractall(path='.')

# Clean up the archive if one is lying around. The existence check keeps the
# cell re-runnable: an unconditional os.remove() raises once the archive has
# already been deleted by a previous run.
if os.path.exists(ARCHIVE_NAME):
    os.remove(ARCHIVE_NAME)

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)
Decompressing 20news-bydate.tar.gz


In [5]:
# Newsgroups to load; only these four of the 20 are used in this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [6]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split restricted to the selected categories; the fixed
# random_state makes the shuffled order repeatable between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)


Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


The returned dataset is a scikit-learn “bunch”: a simple holder object with fields that can be both accessed as python dict keys or object attributes for convenience, for instance the target_names holds the list of the requested category names:

In [7]:
# Names of the categories that were loaded.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [8]:
# Number of documents in the training subset.
len(twenty_train.data)

2257

In [9]:
# Number of filenames; expected to match the number of documents.
len(twenty_train.filenames)

2257

In [10]:
# Show the first three lines of the first training document ...
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

# ... followed by the name of the category it is labelled with.
first_doc_label = twenty_train.target[0]
print(twenty_train.target_names[first_doc_label])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
comp.graphics


In [11]:
# Integer labels of the first ten documents; each one indexes into target_names.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [12]:
# Translate the first ten integer labels back into category names.
for category_id in twenty_train.target[:10]:
    category_name = twenty_train.target_names[category_id]
    print(category_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


Load All Data into array

In [5]:
import json
from pprint import pprint
# Read the 13 line-delimited JSON files (one JSON object per line) into a
# single list, turning each record's '|'-separated TAGS string into a list.
data_arr = []
for file_no in range(1, 14):
    json_path = 'Data/stack_overflow' + str(file_no) + '.json'
    with open(json_path) as data_file:
        for raw_line in data_file:
            record = json.loads(raw_line)
            record['TAGS'] = record['TAGS'].split('|')
            data_arr.append(record)

pprint(len(data_arr))

110000


In [None]:
# Save Pickle

In [7]:
import os
from six.moves import cPickle as pickle
pickle_file =  'data.pickle'

# Persist the parsed records so that later sessions can load them directly
# instead of re-parsing the JSON files.
try:
  # The context manager closes the file even when pickle.dump raises, so the
  # handle is never leaked on failure.
  with open(pickle_file, 'wb') as f:
    save = {
      'data_arr': data_arr,
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  # Report which file could not be written, then re-raise so the failure is
  # not silently ignored.
  print('Unable to save data to', pickle_file, ':', e)
  raise




In [None]:
# Pickle file size (note: the pickle above is written without compression)

In [8]:
# Report the on-disk size of the pickle file, in bytes.
pickle_size_bytes = os.path.getsize(pickle_file)
print('Compressed pickle size:', pickle_size_bytes)

Compressed pickle size: 12050794


# Load Pickle

In [3]:
from six.moves import cPickle as pickle
pickle_file = 'data.pickle'

# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this notebook.
with open(pickle_file, 'rb') as pickled:
  # The file stores a dict; only its 'data_arr' entry is kept, so the
  # enclosing dict can be garbage-collected right away.
  data_arr = pickle.load(pickled)['data_arr']

In [6]:
# Sanity check after loading: show the first record and the record count.
print(data_arr[0])
print(len(data_arr))

{'TITLE': 'DialogFragment not attached to activity', 'TAGS': ['android', 'android-fragments', 'android-dialogfragment']}
110000


BAG of words here.

Vectorizing

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of words: learn the vocabulary from the training documents and turn
# them into a matrix of token counts in one call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Shown as the cell output; presumably (n_documents, n_vocabulary_terms) - TODO confirm.
X_train_counts.shape


In [None]:
# Look up the entry the fitted vocabulary holds for the word 'algorithm'
# (presumably its column index in the count matrix - confirm in the docs).
count_vect.vocabulary_.get(u'algorithm')

TF IDF step below

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# use_idf=False switches off the inverse-document-frequency weighting, so this
# presumably yields plain (normalised) term frequencies - confirm in the docs.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


In [None]:
# Fit the TF-IDF transformer with its default settings and transform the
# counts in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


# # Classifier below

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

To use a stochastic or mini-batch algorithm above,
either pass clf on to the next iteration or choose another classifier.

Now testing data below

In [None]:
# Two short documents which the classifier has not seen.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# New text goes through the already-fitted vectorizer and TF-IDF transformer
# (transform only, no re-fitting) before it reaches the classifier.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Print each document next to the name of its predicted category.
for doc_idx, doc in enumerate(docs_new):
    category_name = twenty_train.target_names[predicted[doc_idx]]
    print('%r => %s' % (doc, category_name))



Building a pipeline below: it chains the vectorizer, the TF-IDF transformer and the classifier into a single estimator.

In [None]:
from sklearn.pipeline import Pipeline
# Chain the three steps used above (vectorizer -> TF-IDF -> classifier).
# The step names are what grid-search parameter keys such as
# 'vect__ngram_range' refer to.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

Now training the whole pipeline on the data, in one line:

In [None]:
# One fit call on the raw text: the pipeline feeds the documents through the
# 'vect' and 'tfidf' steps and trains the final 'clf' step on the result.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

apparently the pipeline doesn't do anything else other than just doing it all in one step


now checking the accuracy on the test data

In [None]:
import numpy as np
# Evaluate on the test split of the same categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# The comparison gives one True/False per test document; the mean of that
# boolean array is the fraction predicted correctly, i.e. the accuracy.
np.mean(predicted == twenty_test.target)

Checking for better accuracy with a linear SVM below, just by swapping the classifier step of the pipeline (SGDClassifier instead of MultinomialNB).

In [None]:
from sklearn.linear_model import SGDClassifier
# Same pipeline as before, with a linear SVM (hinge loss trained by SGD) as
# the final step.
# max_iter=5 with tol=None runs exactly five passes over the data; it replaces
# the `n_iter=5` argument, which current scikit-learn releases no longer accept.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=5, tol=None,
                                           random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label is correct.
np.mean(predicted == twenty_test.target)


For performance analysis of classifiers, using metrics provided by sk-learn

In [None]:
from sklearn import metrics
# Per-category summary comparing the predictions with the true test labels.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

In [None]:
# Confusion matrix of true test labels against predicted labels.
print(metrics.confusion_matrix(twenty_test.target, predicted))

Parameter tuning using grid search: trying every combination from a grid of candidate parameter values and keeping the best-scoring one.

We’ve already encountered some parameters such as use_idf in the TfidfTransformer. Classifiers tend to have many parameters as well; e.g., MultinomialNB includes a smoothing parameter alpha and SGDClassifier has a penalty parameter alpha and configurable loss and penalty terms in the objective function (see the module documentation, or use the Python help function, to get a description of these).

Instead of tweaking the parameters of the various components of the chain, it is possible to run an exhaustive search of the best parameters on a grid of possible values. We try out all classifiers on either words or bigrams, with or without idf, and with a penalty parameter of either 0.01 or 0.001 for the linear SVM:

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid of candidate values; each key is '<pipeline step name>__<parameter>'.
parameters = {
    # words only vs. words and bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with or without idf
    'tfidf__use_idf': (True, False),
    # penalty parameter of the linear SVM
    'clf__alpha': (0.01, 0.001),
}

Obviously, such an exhaustive search can be expensive. If we have multiple CPU cores at our disposal, we can tell the grid searcher to try these eight parameter combinations in parallel with the n_jobs parameter. If we give this parameter a value of -1, grid search will detect how many cores are installed and use them all:

In [None]:
# n_jobs=-1: evaluate the parameter combinations in parallel on all cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

below, performing search on a smaller subset of training data to speed up the computation

In [None]:
# Only the first 400 training documents are used, to keep the search fast.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

the result of calling fit on GridSearchCV object is a classifier on which we can then use the predict function 

In [None]:
# Predict a single new document with the fitted grid search and map the
# predicted label back to its category name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

The object’s best_score_ and best_params_ attributes store the best mean score and the parameters setting corresponding to that score:

In [None]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the best score has to be printed explicitly here.
print("best score: %r" % gs_clf.best_score_)

# Parameter values of the best-scoring combination, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

