In [1]:
print("hello world")

hello world


In [17]:
# get some libraries that will be useful
import re
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.image as mpimg

# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

# Outputs the headlines dataset.
# X is a list of headlines
# y is a list of binary labels, 0 stands for Haaretz and 1 for Israel Hayom
def load_dataset(filenames=['haaretz.csv','israelhayom.csv']):
    cur_y = 0
    X = pd.DataFrame()
    y = np.empty(0,dtype=np.int32)
    for filename in filenames:
        train_cur = pd.read_csv(filename, header=None)
        X = pd.concat([X,train_cur[0]])
        y = np.append(y,cur_y*np.ones(len(train_cur),dtype=np.int32))
        cur_y += 1
    X = [x[0] for x in X.values.tolist()]
    y = y.tolist()
    return X,y

# grab the data
X,Y = load_dataset()


In [18]:
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

In [19]:
X = [normalize_text(s) for s in X]
X

['london rabbi preaches inclusivity toward gay people and sets off an uproar',
 "u.k.'s may announces she will form government day after losing parliamentary majority",
 'theresa may heads to buckingham palace in bid to form government',
 'trump calls comey a leaker after former fbi boss testimony',
 'british young people take revenge for the brexit sin',
 "israel's defense minister hamas commander planning attacks against israel from lebanon",
 'theresa may to seek permission to form government after losing majority',
 "working in israel's defense industry a leftist's dilemma ",
 'low marriage rates and intermarriage threaten future of u.s jewry report warns',
 'hate crime suspected after graffiti slashed tires found in arab jerusalem neighborhood',
 'why isis is thanking trump for his response to its terror attack on iran ',
 'the experiment that shows under what circumstances we could peacefully live with the palestinians',
 'u.k election what the hung parliament result means for br

In [28]:
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(X)
encoder = LabelEncoder()
y = encoder.fit_transform(Y)

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# take a look at the shape of each of these
print(np.array(X).shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(6558,)
(5246, 8853)
(5246,)
(1312, 8853)
(1312,)


In [29]:
print(np.array(X).shape)


(6558,)


In [40]:
nb = MultinomialNB(alpha=1)
nb.fit(x_train, y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [39]:
nb.score(x_test, y_test)

0.78810975609756095

In [25]:
coefs = nb.coef_
print(coefs.shape)
print(coefs)

(1, 8853)
[[ -6.93478548 -12.76666795  -7.81790806 ..., -12.76666795 -10.36877268
  -10.36877268]]


In [13]:
def make_reverse_vocabulary(vectorizer):
    revvoc = {}

    vocab = vectorizer.vocabulary_
    for w in vocab:
        i = vocab[w]

        revvoc[i] = w

    return revvoc

In [15]:
make_reverse_vocabulary(vectorizer)

{0: '000',
 1: '007',
 2: '10',
 3: '100',
 4: '100b',
 5: '100m',
 6: '100th',
 7: '101',
 8: '102',
 9: '11',
 10: '110',
 11: '110b',
 12: '115',
 13: '118',
 14: '11th',
 15: '12',
 16: '120',
 17: '121',
 18: '12m',
 19: '12th',
 20: '13',
 21: '133m',
 22: '13m',
 23: '14',
 24: '140',
 25: '148k',
 26: '148th',
 27: '149',
 28: '14th',
 29: '15',
 30: '15m',
 31: '15th',
 32: '16',
 33: '167',
 34: '16th',
 35: '17',
 36: '171',
 37: '174',
 38: '178m',
 39: '18',
 40: '180',
 41: '187',
 42: '19',
 43: '1926',
 44: '1928',
 45: '1935',
 46: '1948',
 47: '1950s',
 48: '196',
 49: '1967',
 50: '1970s',
 51: '1972',
 52: '1973',
 53: '1980s',
 54: '1984',
 55: '1987',
 56: '1988',
 57: '1989',
 58: '1993',
 59: '1995',
 60: '1997',
 61: '1b',
 62: '1m',
 63: '20',
 64: '200',
 65: '2001',
 66: '2002',
 67: '2006',
 68: '2008',
 69: '2009',
 70: '200m',
 71: '2010',
 72: '2011',
 73: '2012',
 74: '2013',
 75: '2014',
 76: '2015',
 77: '2016',
 78: '2017',
 79: '2018',
 80: '202',
 