# New York Times Article Analysis

Yupeng Yang

Jan 24, 2019


## Natural Language Processing

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk.data
import numpy as np

In [2]:
# If you are running NLTK for the first time, try download NLTK stopwords
# uncomment to run below:
# nltk.download("stopwords")

# Optionally, you can also download all data from NLTK, this step takes time
# As you as you download NLTK once, the data will reside in your machine
# uncomment to run below:
# nltk.download("all")  # optional

In [3]:
categories = ['comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.misc']
data = fetch_20newsgroups(subset='train', categories=categories).data
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
newsgroups

{'data': ['From: geb@cs.pitt.edu (Gordon Banks)\nSubject: Re: Name of MD\'s eyepiece?\nReply-To: geb@cs.pitt.edu (Gordon Banks)\nOrganization: Univ. of Pittsburgh Computer Science\nLines: 13\n\nIn article <C4IHM2.Gs9@watson.ibm.com> clarke@watson.ibm.com (Ed Clarke) writes:\n>|> |It\'s not an eyepiece.  It is called a head mirror.  All doctors never\n>\n>A speculum?\n\nThe speculum is the little cone that fits on the end of the otoscope.\nThere are also vaginal specula that females and gynecologists are\nall too familiar with.\n-- \n----------------------------------------------------------------------------\nGordon Banks  N3JXP      | "Skepticism is the chastity of the intellect, and\ngeb@cadre.dsl.pitt.edu   |  it is shameful to surrender it too soon." \n----------------------------------------------------------------------------\n',
  'Subject: Let it be Known\nFrom: <ISSBTL@BYUVM.BITNET>\nOrganization: Brigham Young University\nLines: 10\n\nI would like to make everyone aware that 

In [6]:
type(data), len(data)

(list, 2240)

In [10]:
data[0]

'From: geb@cs.pitt.edu (Gordon Banks)\nSubject: Re: Name of MD\'s eyepiece?\nReply-To: geb@cs.pitt.edu (Gordon Banks)\nOrganization: Univ. of Pittsburgh Computer Science\nLines: 13\n\nIn article <C4IHM2.Gs9@watson.ibm.com> clarke@watson.ibm.com (Ed Clarke) writes:\n>|> |It\'s not an eyepiece.  It is called a head mirror.  All doctors never\n>\n>A speculum?\n\nThe speculum is the little cone that fits on the end of the otoscope.\nThere are also vaginal specula that females and gynecologists are\nall too familiar with.\n-- \n----------------------------------------------------------------------------\nGordon Banks  N3JXP      | "Skepticism is the chastity of the intellect, and\ngeb@cadre.dsl.pitt.edu   |  it is shameful to surrender it too soon." \n----------------------------------------------------------------------------\n'

In [11]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from string import punctuation

In [13]:
from nltk.stem.porter   import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet  import WordNetLemmatizer

print(SnowballStemmer('english').stem('running'))
print(WordNetLemmatizer().lemmatize('caused'))

run
caused


In [14]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
words = vectorizer.get_feature_names()

In [30]:
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
words

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '10',
 '100',
 '11',
 '12',
 '128',
 '129',
 '13',
 '130',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '192',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1993apr13',
 '1993apr14',
 '1993apr15',
 '1993apr16',
 '1993apr17',
 '1993apr19',
 '1993apr5',
 '1993apr6',
 '20',
 '200',
 '21',
 '22',
 '23',
 '24',
 '241',
 '25',
 '250',
 '256',
 '26',
 '27',
 '28',
 '286',
 '29',
 '2b',
 '2d',
 '2nd',
 '30',
 '300',
 '31',
 '312',
 '32',
 '33',
 '333',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '3d',
 '3do',
 '40',
 '400',
 '408',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '48',
 '49',
 '50',
 '500',
 '51',
 '52',
 '53',
 '54',
 '56',
 '59',
 '60',
 '61',
 '617',
 '64',
 '65',
 '70',
 '75',
 '80',
 '800',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '95',
 '9760',
 '__',
 '___',
 '____',
 'aaa',
 'ab',
 'ability',
 'able',
 'abortion',
 'absolutely',
 'abuse',
 'ac',
 'academic',
 'accept',
 'accepted',
 'access',
 

In [19]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]

In [29]:
np.argsort([7, 3, 2, 4, 1])[::-1][:3]

array([0, 3, 1])

In [20]:
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
print("top 10 by average tf-idf")
print(get_top_values(avg, 10, words))

top 10 by average tf-idf
['limbaugh', 'jb', 'mwra', 'radiosity', 'zisfein', 'oswego', 'stephanopoulos', 'den', 'journalism', 'crohn']


In [31]:
total = np.sum(vectors, axis=0)
print("top 10 by total tf-idf")
print(get_top_values(total, 10, words))

top 10 by total tf-idf
['edu', 'com', 'subject', 'lines', 'organization', 'article', 'writes', 'university', 'cs', 'posting']


In [32]:
# redo vectorization without using idf
vectorizer2 = TfidfVectorizer(use_idf=False, max_features=2000)
# make documents into one giant document for this purpose
vectors2 = vectorizer2.fit_transform(["\n".join(data)]).toarray()
print("top 10 by tf across all corpus")
print(get_top_values(vectors2[0], 10, words))

top 10 by tf across all corpus
['talking', 'thf2', 'ny', 'american', 'ii', 'int', 'talk', 'interesting', 'feel', 'duke']


In [33]:
all_newsgroups = fetch_20newsgroups()
all_data = np.array(all_newsgroups.data)

for i, category in enumerate(all_newsgroups.target_names):
    data = all_data[all_newsgroups.target == i]
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(data).toarray()
    words = vectorizer.get_feature_names()
    print("Category: %s" % category)
    avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
    print("  Top 10 by average tf-idf")
    print("    %s" % ", ".join(get_top_values(avg, 10, words)))
    total = np.sum(vectors, axis=0)
    print("  Top 10 by total tf-idf")
    print("    %s" % ", ".join(get_top_values(total, 10, words)))
    print("-----------------------------")

Category: alt.atheism
  Top 10 by average tf-idf
    enviroleague, dlb, vonnegut, b12, tyre, racism, ites, rb, maine, bye
  Top 10 by total tf-idf
    edu, com, keith, god, people, caltech, writes, don, sgi, livesey
-----------------------------
Category: comp.graphics
  Top 10 by average tf-idf
    xxxx, sphinx, p_c, kewageshig, siemens, sink, bates, bockamp, stereoscopic, newcastle
  Top 10 by total tf-idf
    edu, graphics, com, lines, subject, organization, university, posting, host, nntp
-----------------------------
Category: comp.os.ms-windows.misc
  Top 10 by average tf-idf
    donoghue, ramirez, osburn, maley, drum, bekker, toelle, ax, srini, schwenk
  Top 10 by total tf-idf
    edu, windows, com, file, dos, lines, subject, organization, university, ax
-----------------------------
Category: comp.sys.ibm.pc.hardware
  Top 10 by average tf-idf
    netbios, dh, interliving, gamecards, ren, btr, erlangen, harding, satam, oracle
  Top 10 by total tf-idf
    edu, scsi, com, drive, 

## Documents Classification

### Load Train Data

In [34]:
data = fetch_20newsgroups(subset='train', categories=categories).data
target = fetch_20newsgroups(subset='train', categories=categories).target

In [35]:
len(data), type(data)

(2240, list)

In [36]:
target.shape

(2240,)

### Vectorize documents

In [37]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
words = vectorizer.get_feature_names()

In [38]:
vectors.shape

(2240, 2000)

In [39]:
X = vectors
y = target

### Load and Transform Test Data

In [40]:
test_data = fetch_20newsgroups(subset='test', categories=categories).data
test_target = fetch_20newsgroups(subset='test', categories=categories).target

In [60]:
y_test = test_target
X_test = vectorizer.transform(test_data)


### Build classifiers with sklearn 

#### Let's first try Logisitc Regression

In [42]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
model.score(X, y)

0.9861607142857143

In [44]:
model.coef_.shape

(4, 2000)

In [45]:
# Get top words that makes prediciton of such a category
num_category = 0

print(categories[num_category])

get_top_values(model.coef_[num_category], 10, words)

comp.graphics


['graphics',
 'image',
 '3d',
 'files',
 'images',
 'file',
 '3do',
 'windows',
 'points',
 'software']

In [46]:
# Get score for training set
model.score(X, y)

0.9861607142857143

In [47]:
# Get score for test set
model.score(X_test, y_test)

0.925603217158177

#### Naive Bayes Classifier

In [48]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
# Get score for training set
model.score(X, y)

0.9714285714285714

In [50]:
# Get score for test set
model.score(X_test, y_test)

0.9175603217158177

In [51]:
X.shape

(2240, 2000)

#### Randorm Forest Classifier

In [52]:
# use one vs rest classifier for multi-class classification
from sklearn.multiclass import OneVsRestClassifier

In [53]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=200,max_depth=50,min_samples_leaf=3, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
          n_jobs=-1)

In [54]:
# Get score for training set
model.score(X, y)

0.9924107142857143

In [55]:
# Get score for test set
model.score(X_test, y_test)

0.925603217158177

#### Gradient Boosted Trees

In [56]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,max_depth=5,min_samples_leaf=2, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False),
          n_jobs=-1)

In [57]:
# Get score for training set
model.score(X, y)

0.9986607142857142

In [58]:
# Get score for test set
model.score(X_test, y_test)

0.8806970509383378