In [1]:
# Notebook to explore data and complement Skafos text predictor

In [2]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Select training and testing data. This creates newsgroups_train and newsgroups_test as sklearn.utils.Bunch objects.
# `remove` strips headers, footers and quoted replies from every post before it is returned.
# `shuffle` is a boolean flag: the string 'True' only worked because any non-empty string is
# truthy (the string 'False' would have shuffled as well), so pass a real bool.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True)

In [4]:
# Explore the target names: the list of newsgroup category labels carried by the training Bunch.
# The integer class ids in `newsgroups_train.target` are positions in this list.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
# Example of what training data looks like: display the first two documents as raw strings
# (headers, footers and quotes were already removed when the dataset was loaded).
newsgroups_train.data[:2]

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't an

In [15]:
# View how these texts are classified: the integer class ids of the same first two documents.
# Each id indexes into `newsgroups_train.target_names` (e.g. 7 -> 'rec.autos').
newsgroups_train.target[:2]

array([7, 4])

In [9]:
# Use training data and scikit-learn feature extraction functions to create feature vectors from the text data
# Want TF-IDF weightings
from sklearn.feature_extraction.text import TfidfVectorizer
# Ignore scikit-learn's built-in English stop-word list when building the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary/IDF weights from the training documents and vectorise them in one step.
# The fitted `vectorizer` is reused later (transform only) for the test set and ad-hoc examples.
X_train_tfidf = vectorizer.fit_transform(newsgroups_train.data)
# Displayed as (number of training documents, number of vocabulary terms).
X_train_tfidf.shape

(11314, 101322)

In [16]:
# Look at results of TF-IDF weightings.
# Printing the sparse matrix lists its stored (non-zero) entries as "(row, column)  value",
# i.e. (document index, vocabulary index) followed by the TF-IDF weight.
print(X_train_tfidf)

  (0, 96879)	0.1370359812611726
  (0, 37256)	0.20599311323287348
  (0, 25717)	0.46579831435138963
  (0, 80420)	0.12706903967122096
  (0, 31927)	0.10526008886822913
  (0, 34741)	0.14847880131844232
  (0, 84312)	0.1636839250592851
  (0, 57247)	0.13520842471059058
  (0, 55606)	0.1382259698975382
  (0, 9843)	0.20797700857530219
  (0, 35902)	0.12667096041973439
  (0, 11174)	0.20599311323287348
  (0, 25437)	0.10548299054214268
  (0, 24108)	0.2472313451421643
  (0, 34742)	0.17300821242559042
  (0, 76269)	0.08978258481915571
  (0, 83208)	0.11339406589538421
  (0, 16806)	0.14077745547061019
  (0, 24583)	0.19644480500804057
  (0, 81450)	0.14613089342888969
  (0, 77676)	0.12197186951739483
  (0, 23430)	0.1293710328851233
  (0, 54493)	0.06961997844491916
  (0, 87913)	0.25808578247347563
  (0, 62594)	0.13037295035007845
  :	:
  (11313, 47085)	0.15717057502749704
  (11313, 81583)	0.15204670587770022
  (11313, 81534)	0.13599170815448167
  (11313, 96205)	0.12259690956628234
  (11313, 89541)	0.14649727

In [28]:
# Explore some of the names of the features (the vocabulary term behind a TF-IDF column).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array of strings, so the
# selected term is converted with str() to keep the displayed value a plain Python string.
str(vectorizer.get_feature_names_out()[-2350])

'xtunmapwidget'

In [37]:
# Train two classifiers on the TF-IDF training features and their integer class labels:
#   clf - multinomial naive Bayes (the model used for the predictions in this notebook)
#   rfc - random forest with 100 trees
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# alpha is the additive smoothing parameter of MultinomialNB.
clf = MultinomialNB(alpha=0.01).fit(X_train_tfidf, newsgroups_train.target)
# A random forest is stochastic; pin random_state so re-running the notebook rebuilds the same model.
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, newsgroups_train.target)

In [38]:
# Number of training documents the naive Bayes model saw for each class during fitting
# (one entry per newsgroup, in `target_names` order) - a quick check of class balance.
clf.class_count_

array([480., 584., 591., 590., 578., 593., 585., 594., 598., 597., 600.,
       595., 591., 594., 593., 599., 546., 564., 465., 377.])

In [23]:
# Assess the performance of this classifier on test holdout sample
import numpy as np
from sklearn import metrics
# Reuse the vectorizer fitted on the training set: transform only, never re-fit on test data.
X_test_tfidf = vectorizer.transform(newsgroups_test.data)
# A bare `.shape` expression in the middle of a cell is evaluated and thrown away, so verify
# the shape explicitly instead: one row per test document, same feature columns as training.
assert X_test_tfidf.shape == (len(newsgroups_test.data), X_train_tfidf.shape[1])
# Predict a class id for every test document and compare against the true labels.
news_pred = clf.predict(X_test_tfidf)
accuracy = metrics.accuracy_score(newsgroups_test.target, news_pred)
print(accuracy)

0.7010090281465746


In [40]:
# Sanity-check the trained model on a hand-written, baseball-flavoured snippet.
# The vectorizer expects an iterable of documents, hence the one-element list.
example_text = [
    "Has anyone seen the world series? "
    "The pitchers and batters are really excellent. "
    "Lots of stolen bases. "
    "I love a world series home run."
]
# Project the snippet into the TF-IDF space learned from the training data.
ex_tfidf = vectorizer.transform(example_text)
# Predicted integer class id(s); look the id up in newsgroups_train.target_names for the label.
ex_predict = clf.predict(ex_tfidf)
print(ex_predict)

[9]
