In [11]:
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.image as mpimg
import os
import urllib
from copy import deepcopy
# Constants
# -----------------------------------------------------------------------------

In [12]:
# Functions used in multiple parts
# -----------------------------------------------------------------------------
def getNews(seed = 0):
    """Load the fake and real headlines and return them in shuffled order.

    Reads ``clean_fake.txt`` and ``clean_real.txt`` from the current working
    directory (one headline per line) and shuffles each list in place with
    ``np.random.shuffle``.

    Parameters
    ----------
    seed : int, optional
        Accepted for backward compatibility but not used. The shuffles draw
        from NumPy's global random state, so call ``np.random.seed(...)``
        before this function to get a repeatable order.

    Returns
    -------
    tuple
        ``(fakeNews, realNews)``, two lists of headline strings.
    """
    # Context managers guarantee the handles are closed even if read() raises.
    with open("clean_fake.txt", 'r') as fakeFile:
        fakeNews = fakeFile.read().splitlines()
    with open("clean_real.txt", 'r') as realFile:
        realNews = realFile.read().splitlines()

    # Fake headlines are shuffled first, then real ones (order matters for
    # reproducibility under a fixed global seed).
    np.random.shuffle(fakeNews)
    np.random.shuffle(realNews)

    return fakeNews, realNews

In [64]:
def dataSetSplit(fakeNews, realNews,seed = 0):
    """Split each class into training, validation and test portions.

    For each input sequence the first ``int(0.7*len)`` items go to training,
    the items up to ``int(0.85*len)`` go to validation, and the remainder go
    to test. ``seed`` is accepted but not used.

    Returns three dicts (training, validation, test), each with the keys
    ``'real'`` and ``'fake'``.
    """
    def _threeWay(items):
        # Boundaries are computed once per sequence.
        trainEnd = int(0.7*len(items))
        validEnd = int(0.85*len(items))
        return items[:trainEnd], items[trainEnd:validEnd], items[validEnd:]

    realTrain, realValid, realTest = _threeWay(realNews)
    fakeTrain, fakeValid, fakeTest = _threeWay(fakeNews)

    trainingSet = {'real': realTrain, 'fake': fakeTrain}
    validationSet = {'real': realValid, 'fake': fakeValid}
    testSet = {'real': realTest, 'fake': fakeTest}

    return trainingSet, validationSet, testSet

In [65]:
def generateWordBase(trainingSet):
    """Return the distinct space-separated tokens found in ``trainingSet``.

    ``trainingSet`` maps a label to an iterable of headline strings. Every
    headline is split on a single space character and the unique tokens are
    returned as a list. The order of the list is whatever order the
    intermediate ``set`` iterates in.
    """
    allTokens = [
        token
        for label in trainingSet.keys()
        for headline in trainingSet[label]
        for token in headline.split(' ')
    ]
    return list(set(allTokens))

In [66]:
def generateData(wordBase, targetSet):
    """Build a word-presence matrix with a trailing label row.

    Parameters
    ----------
    wordBase : list of str
        Vocabulary; the position of a word in this list is its row index.
    targetSet : dict
        Must have the keys ``'real'`` and ``'fake'``, each mapping to a
        sequence of headline strings.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(wordBase) + 1, n_real + n_fake)``. Columns hold
        the real headlines first, then the fake ones. Entry ``[i, n]`` is 1
        when ``wordBase[i]`` occurs in headline ``n``; the last row is 1 for
        real headlines and 0 for fake ones.

    Notes
    -----
    Words that are not in ``wordBase`` are ignored. This lets the function
    encode validation/test headlines containing words never seen in the
    training vocabulary instead of raising ``ValueError``.
    """
    # One dict lookup per token instead of a linear list.index() scan.
    # setdefault keeps the first position if wordBase has duplicates, which
    # matches what list.index() would have returned.
    wordIndex = {}
    for position, word in enumerate(wordBase):
        wordIndex.setdefault(word, position)

    numSamples = len(targetSet['real']) + len(targetSet['fake'])
    xy = np.zeros([len(wordBase)+1, numSamples])
    n = 0
    for rof in ['real', 'fake']:
        for news in targetSet[rof]:
            for word in news.split(' '):
                row = wordIndex.get(word)
                if row is not None:
                    xy[row, n] = 1
            if rof == 'real':
                # Label row: 1 marks a real headline.
                xy[-1, n] = 1
            n += 1
    return xy

In [67]:
def f(x,t,W):
    """Summed negative log-likelihood of a logistic model.

    Computes ``sum(-t*log(y) - (1-t)*log(1-y))`` where
    ``y = sigmoid(W.T x)``, in a numerically stable form.

    Parameters
    ----------
    x : array
        Inputs; in this notebook it is passed as (features, samples).
    t : array
        Targets in {0, 1}; in this notebook a 1-D array, one per sample.
    W : array
        Weights; in this notebook of shape (features, 1).

    Returns
    -------
    scalar
        The loss summed over all samples.
    """
    o = np.matmul(W.T,x)
    # -log(sigmoid(o)) == logaddexp(0, -o) and -log(1 - sigmoid(o)) ==
    # logaddexp(0, o). Using these avoids log(0) (and the resulting nan from
    # 0 * -inf) when the sigmoid saturates at exactly 0 or 1.
    # np.sum is used explicitly so the result is a scalar regardless of
    # whether the builtin sum has been shadowed by a star import.
    return np.sum(t*np.logaddexp(0, -o) + (1-t)*np.logaddexp(0, o))

def df(x,t,W):
    """Gradient of the logistic negative log-likelihood with respect to W.

    Evaluates ``x (sigmoid(W.T x) - t).T``.
    """
    prediction = 1/(1+np.exp(-np.matmul(W.T,x)))
    residual = prediction - t
    return np.matmul(x, residual.T)

In [68]:
def sigmoidForward(y):
    """Apply the logistic sigmoid ``1 / (1 + exp(-y))`` element-wise.

    Uses ``np.exp`` explicitly (as ``f`` and ``df`` do) rather than a bare
    ``exp`` that only resolves through a star import.
    """
    return 1/(1+np.exp(-y))

def fcForward(x,W):
    """Linear (fully connected) layer without bias: returns ``W.T x``."""
    weightsT = np.transpose(W)
    return np.matmul(weightsT, x)
    
def forward(x, W):
    """Run the model: linear layer (``fcForward``) followed by the sigmoid."""
    return sigmoidForward(fcForward(x,W))

def sigmoidBackward(y,output):
    """Return ``output - y``, the difference between prediction and target.

    In ``backward`` this value is used as the gradient that is fed to the
    linear layer's backward pass.
    """
    delta = output - y
    return delta

def fcBackward(dy,x):
    """Weight gradient of the linear layer: returns ``x dy.T``."""
    return np.matmul(x, np.transpose(dy))

def backward(y,output,x):
    """Weight gradient for the model: ``fcBackward(sigmoidBackward(y, output), x)``."""
    return fcBackward(sigmoidBackward(y,output), x)

def NLL(y, output):
    """Summed negative log-likelihood of targets ``y`` under ``output``.

    Returns ``-sum(y*log(output) + (1-y)*log(1-output))`` as a scalar.

    ``np.sum`` and ``np.log`` are used explicitly: with the builtin ``sum`` a
    2-D ``output`` would be reduced along its first axis only and yield an
    array rather than a scalar.
    """
    return -np.sum(y*np.log(output)+(1-y)*np.log((1-output)))

def backwardReg(y,output,x, W, lam):
    """Weight gradient of the model plus the penalty gradient ``2*lam*W``."""
    dataGrad = fcBackward(sigmoidBackward(y,output), x)
    penaltyGrad = 2*lam*W
    return dataGrad + penaltyGrad

def NLLReg(y,output, lam, W):
    """Negative log-likelihood plus an L2 penalty ``lam * sum(W**2)``.

    The penalty is the squared L2 norm of ``W`` so that its derivative is
    ``2*lam*W``, the term ``backwardReg`` adds to the gradient. The previous
    penalty, ``sum(lam*W)``, was linear in ``W`` and did not match that
    gradient.

    Returns a scalar.
    """
    dataLoss = -np.sum(y*np.log(output)+(1-y)*np.log((1-output)))
    penalty = lam*np.sum(W**2)
    return dataLoss + penalty

In [69]:
def grad_descent(f, df, x, y, init_t, alpha=0.001,max_iter=10000, EPS = 1e-5, print_every=5):
    """Minimise ``f`` by plain gradient descent.

    Parameters
    ----------
    f : callable
        Cost function, called as ``f(x, y, t)``; used only for progress output.
    df : callable
        Gradient of the cost with respect to ``t``, called as ``df(x, y, t)``.
    x, y :
        Passed through unchanged to ``f`` and ``df``.
    init_t : numpy.ndarray
        Starting parameters. Not modified; a copy is updated instead.
    alpha : float
        Step size.
    max_iter : int
        Maximum number of updates.
    EPS : float
        Stop once the norm of the last update is no larger than this value.
    print_every : int, optional
        Print the iteration number and cost every this many iterations
        (default 5). A falsy value (``0`` or ``None``) disables printing.

    Returns
    -------
    numpy.ndarray
        The parameters after the final update.
    """
    # Offset the "previous" value so the first loop test always passes.
    prev_t = init_t-10*EPS
    t = init_t.copy()
    step = 0  # named `step` so the builtin iter() is not shadowed

    while np.linalg.norm(t - prev_t) >  EPS and step < max_iter:

        # normal gradient descent
        prev_t = t.copy()
        t -= alpha*df(x,y,t)

        if print_every and step % print_every == 0:
            # Single-argument print calls emit the same text on Python 2
            # and Python 3.
            print("Iter %d" % step)
            print(f(x,y,t))

        step += 1

    return t

In [70]:
def performance(x, y, W, print_output = True,report = "test set"):
    """Return the fraction of samples that ``forward(x, W)`` classifies correctly.

    A sample counts as correct when the model output is strictly above 0.5
    and its label is 1, or strictly below 0.5 and its label is 0. An output
    of exactly 0.5 is never counted as correct.

    Parameters
    ----------
    x : numpy.ndarray
        Inputs with one column per sample (``x.shape[1]`` samples).
    y : array
        Labels (0 or 1), indexed by sample.
    W : numpy.ndarray
        Weights passed to ``forward``.
    print_output : bool
        When true, print the shape of ``x`` and a summary line. When false,
        nothing is printed.
    report : str
        Name of the data set, used in the summary line.

    Returns
    -------
    float
        Number of correct samples divided by ``x.shape[1]``.
    """
    output = forward(x,W)
    numSamples = x.shape[1]

    if print_output:
        print(x.shape)

    # Vectorised version of the per-sample comparison; only row 0 of the
    # model output is read.
    scores = np.asarray(output)[0, :numSamples]
    labels = np.asarray(y)[:numSamples]
    hits = np.logical_or(
        np.logical_and(scores > 0.5, labels == 1),
        np.logical_and(scores < 0.5, labels == 0),
    )
    # Kept as a float so the printed count reads e.g. "2242.0" and the final
    # division is a true division on Python 2 as well.
    result = float(np.count_nonzero(hits))

    if print_output:
        print("Performance on "+report+ ": " +str(result)+"/"+str(numSamples)+"\n")

    return result/numSamples

In [71]:
# Seed NumPy's global RNG so the shuffles inside getNews() are repeatable.
np.random.seed(1)
fakeNews, realNews = getNews()
# Split each class into training / validation / test subsets.
trainingSet, validationSet, testSet = dataSetSplit(fakeNews, realNews)
# The vocabulary is built from the training headlines only.
wordBase = generateWordBase(trainingSet)

# One column per training headline: rows 0..len(wordBase)-1 are word
# indicators and the last row is the label (1 = real, 0 = fake).
training_xy = generateData(wordBase, trainingSet)

In [72]:
# Initial weights: with mean 0 and standard deviation 0 every entry is exactly 0.
W = np.random.normal(0.,0.,[len(wordBase),1])
# Fit on the feature rows (all but the last) against the label row (the last).
trainedW = grad_descent(f, df, training_xy[:-1,:], training_xy[-1,:], W,alpha=0.0001)

Iter 0
1573.11023164203
Iter 5
1527.5524179960044
Iter 10
1491.0432618851873
Iter 15
1460.0440749564339
Iter 20
1432.7588541074776
Iter 25
1408.2222091188792
Iter 30
1385.8701605603922
Iter 35
1365.3391921340417
Iter 40
1346.3715183499867
Iter 45
1328.7696402198048
Iter 50
1312.3738529605123
Iter 55
1297.0505355917012
Iter 60
1282.685587989733
Iter 65
1269.1803962130664
Iter 70
1256.4490994692433
Iter 75
1244.4165762484613
Iter 80
1233.0168659128371
Iter 85
1222.1918813452485
Iter 90
1211.8903339520482
Iter 95
1202.0668239829365
Iter 100
1192.6810650346547
Iter 105
1183.6972201456638
Iter 110
1175.0833319277372
Iter 115
1166.810832491046
Iter 120
1158.8541213190306
Iter 125
1151.1902011195903
Iter 130
1143.7983632034598
Iter 135
1136.6599152133786
Iter 140
1129.7579451025144
Iter 145
1123.0771161724415
Iter 150
1116.6034887554636
Iter 155
1110.324364783751
Iter 160
1104.2281520458462
Iter 165
1098.3042454043784
Iter 170
1092.5429226500123
Iter 175
1086.93525300668
Iter 180
1081.4730165

Iter 1500
642.7269445589264
Iter 1505
642.017765611784
Iter 1510
641.3109614920136
Iter 1515
640.6065168366139
Iter 1520
639.9044164310677
Iter 1525
639.2046452074454
Iter 1530
638.5071882425384
Iter 1535
637.8120307560225
Iter 1540
637.1191581086493
Iter 1545
636.4285558004676
Iter 1550
635.7402094690716
Iter 1555
635.0541048878765
Iter 1560
634.3702279644219
Iter 1565
633.6885647387011
Iter 1570
633.009101381515
Iter 1575
632.3318241928546
Iter 1580
631.656719600305
Iter 1585
630.983774157476
Iter 1590
630.3129745424559
Iter 1595
629.6443075562895
Iter 1600
628.9777601214784
Iter 1605
628.3133192805046
Iter 1610
627.6509721943771
Iter 1615
626.9907061411985
Iter 1620
626.3325085147549
Iter 1625
625.6763668231259
Iter 1630
625.0222686873169
Iter 1635
624.370201839909
Iter 1640
623.7201541237318
Iter 1645
623.072113490554
Iter 1650
622.4260679997942
Iter 1655
621.7820058172503
Iter 1660
621.1399152138474
Iter 1665
620.4997845644049
Iter 1670
619.8616023464202
Iter 1675
619.225357138870

Iter 2970
498.2966573049562
Iter 2975
497.9458374348638
Iter 2980
497.595642624705
Iter 2985
497.24607083087835
Iter 2990
496.8971200199326
Iter 2995
496.5487881684991
Iter 3000
496.20107326322466
Iter 3005
495.853973300705
Iter 3010
495.50748628741815
Iter 3015
495.1616102396595
Iter 3020
494.8163431834763
Iter 3025
494.47168315460334
Iter 3030
494.1276281983989
Iter 3035
493.7841763697815
Iter 3040
493.44132573316693
Iter 3045
493.09907436240564
Iter 3050
492.7574203407212
Iter 3055
492.41636176064833
Iter 3060
492.0758967239731
Iter 3065
491.7360233416712
Iter 3070
491.39673973384924
Iter 3075
491.0580440296843
Iter 3080
490.71993436736614
Iter 3085
490.3824088940375
Iter 3090
490.04546576573716
Iter 3095
489.70910314734203
Iter 3100
489.3733192125099
Iter 3105
489.03811214362344
Iter 3110
488.70348013173316
Iter 3115
488.3694213765027
Iter 3120
488.03593408615313
Iter 3125
487.70301647740797
Iter 3130
487.3706667754394
Iter 3135
487.03888321381373
Iter 3140
486.70766403443827
Iter 

Iter 4415
416.8851339081887
Iter 4420
416.6572470971928
Iter 4425
416.4296493154128
Iter 4430
416.20233992530814
Iter 4435
415.9753182914378
Iter 4440
415.7485837804513
Iter 4445
415.52213576107926
Iter 4450
415.295973604124
Iter 4455
415.0700966824506
Iter 4460
414.84450437097763
Iter 4465
414.61919604666804
Iter 4470
414.3941710885201
Iter 4475
414.1694288775585
Iter 4480
413.94496879682526
Iter 4485
413.7207902313709
Iter 4490
413.4968925682457
Iter 4495
413.2732751964906
Iter 4500
413.0499375071288
Iter 4505
412.82687889315696
Iter 4510
412.60409874953615
Iter 4515
412.38159647318406
Iter 4520
412.1593714629656
Iter 4525
411.93742311968504
Iter 4530
411.71575084607696
Iter 4535
411.4943540467985
Iter 4540
411.2732321284202
Iter 4545
411.05238449941857
Iter 4550
410.8318105701669
Iter 4555
410.6115097529279
Iter 4560
410.3914814618448
Iter 4565
410.1717251129336
Iter 4570
409.9522401240749
Iter 4575
409.73302591500567
Iter 4580
409.5140819073116
Iter 4585
409.2954075244187
Iter 4590

Iter 5885
360.17246089265484
Iter 5890
360.0087403706904
Iter 5895
359.84518415372247
Iter 5900
359.6817919651327
Iter 5905
359.5185635289837
Iter 5910
359.3554985700166
Iter 5915
359.19259681364906
Iter 5920
359.0298579859727
Iter 5925
358.8672818137512
Iter 5930
358.7048680244181
Iter 5935
358.5426163460741
Iter 5940
358.3805265074856
Iter 5945
358.2185982380819
Iter 5950
358.0568312679534
Iter 5955
357.8952253278493
Iter 5960
357.7337801491755
Iter 5965
357.57249546399237
Iter 5970
357.4113710050125
Iter 5975
357.2504065055989
Iter 5980
357.08960169976285
Iter 5985
356.9289563221613
Iter 5990
356.7684701080953
Iter 5995
356.6081427935078
Iter 6000
356.4479741149812
Iter 6005
356.28796380973586
Iter 6010
356.12811161562763
Iter 6015
355.9684172711457
Iter 6020
355.80888051541103
Iter 6025
355.64950108817385
Iter 6030
355.4902787298118
Iter 6035
355.3312131813279
Iter 6040
355.1723041843485
Iter 6045
355.0135514811214
Iter 6050
354.85495481451363
Iter 6055
354.6965139280096
Iter 6060


Iter 7345
318.38065231560233
Iter 7350
318.2554150982299
Iter 7355
318.1302832645894
Iter 7360
318.00525666947755
Iter 7365
317.8803351679787
Iter 7370
317.75551861546495
Iter 7375
317.63080686759463
Iter 7380
317.5061997803121
Iter 7385
317.3816972098468
Iter 7390
317.25729901271245
Iter 7395
317.13300504570634
Iter 7400
317.0088151659087
Iter 7405
316.88472923068184
Iter 7410
316.76074709766937
Iter 7415
316.63686862479557
Iter 7420
316.5130936702647
Iter 7425
316.38942209256004
Iter 7430
316.2658537504433
Iter 7435
316.14238850295413
Iter 7440
316.01902620940893
Iter 7445
315.8957667294005
Iter 7450
315.7726099227971
Iter 7455
315.6495556497417
Iter 7460
315.52660377065183
Iter 7465
315.40375414621786
Iter 7470
315.28100663740327
Iter 7475
315.1583611054433
Iter 7480
315.0358174118446
Iter 7485
314.91337541838425
Iter 7490
314.79103498710936
Iter 7495
314.6687959803361
Iter 7500
314.5466582606491
Iter 7505
314.424621690901
Iter 7510
314.30268613421117
Iter 7515
314.1808514539657
Ite

Iter 8815
285.56745421683536
Iter 8820
285.4679594937602
Iter 8825
285.3685373149919
Iter 8830
285.269187595487
Iter 8835
285.1699102503439
Iter 8840
285.07070519480277
Iter 8845
284.9715723442453
Iter 8850
284.8725116141943
Iter 8855
284.77352292031327
Iter 8860
284.6746061784064
Iter 8865
284.57576130441794
Iter 8870
284.47698821443214
Iter 8875
284.3782868246728
Iter 8880
284.2796570515029
Iter 8885
284.1810988114245
Iter 8890
284.0826120210783
Iter 8895
283.9841965972435
Iter 8900
283.88585245683714
Iter 8905
283.7875795169142
Iter 8910
283.68937769466703
Iter 8915
283.5912469074252
Iter 8920
283.49318707265513
Iter 8925
283.3951981079598
Iter 8930
283.2972799310785
Iter 8935
283.1994324598864
Iter 8940
283.10165561239444
Iter 8945
283.00394930674895
Iter 8950
282.9063134612312
Iter 8955
282.80874799425754
Iter 8960
282.71125282437856
Iter 8965
282.6138278702791
Iter 8970
282.5164730507782
Iter 8975
282.41918828482807
Iter 8980
282.3219734915147
Iter 8985
282.2248285900568
Iter 899

In [73]:
# Accuracy of the trained weights on the training data itself (not a held-out set).
performance(training_xy[:-1,:], training_xy[-1,:], trainedW, print_output = True,report = "training set")

(4855L, 2285L)
Performance on training set: 2242.0/2285



0.9811816192560175

In [55]:
# Displays the entire vocabulary list as the cell output; the output is long.
wordBase

['pardon',
 'hats',
 'child',
 'pide',
 'colleges',
 'everybody',
 'manaforts',
 'obstruction',
 'protest',
 'controversial',
 'hanging',
 'protestors',
 'liar',
 'battleground',
 'hate',
 'assembled',
 'marching',
 'stinks',
 'looking',
 'votes',
 'voter',
 'pointing',
 'dazu',
 'paris',
 'tweet',
 'tonya',
 'amtsantritt',
 'investigation',
 'voted',
 'under',
 'teaching',
 'sorry',
 'sway',
 'forbidden',
 'divergent',
 'evangelical',
 'updated',
 'risk',
 'commerce',
 'smacks',
 'regional',
 'advisors',
 'replaces',
 'every',
 'govern',
 'affect',
 'exporters',
 'disneys',
 'improve',
 'trumpisms',
 'tickets',
 'school',
 'snowflakes',
 'debunks',
 'skills',
 'supports',
 'companies',
 'solution',
 'piling',
 'roadblock',
 'pardons',
 'sacks',
 'el',
 'warmbiers',
 'heading',
 'triumph',
 'clothes',
 'uncovers',
 'bill',
 'force',
 'leaders',
 '10m',
 'miller',
 'warns',
 'aides',
 'bridgitte',
 'budget',
 'chef',
 'messy',
 'likely',
 'street',
 '49',
 'streep',
 'trumpprotest',
 'a