In [11]:
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.image as mpimg
import os
import urllib
from copy import deepcopy
# Constants
# -----------------------------------------------------------------------------

In [12]:
# Functions used in multiple parts
# -----------------------------------------------------------------------------
def getNews(seed = 0):
    
    fakeFile = open("clean_fake.txt",'r')
    fakeNews = fakeFile.read().splitlines()
    fakeFile.close()
    realFile = open("clean_real.txt",'r')
    realNews = realFile.read().splitlines()
    realFile.close()
    
    np.random.shuffle(fakeNews)
    np.random.shuffle(realNews)
    
    return fakeNews, realNews

In [13]:
def dataSetSplit(fakeNews, realNews,seed = 0):
    
    trainingSet ={'real': realNews[:int(0.7*len(realNews))], 'fake':fakeNews[:int(0.7*len(fakeNews))]} 
    validationSet = {'real':realNews[int(0.7*len(realNews)):int(0.85*len(realNews))], 'fake':fakeNews[int(0.7*len(fakeNews)):int(0.85*len(fakeNews))]}
    testSet = {'real': realNews[int(0.85*len(realNews)):] , 'fake': fakeNews[int(0.85*len(fakeNews)):]}

    return trainingSet, validationSet, testSet

In [14]:
def generateWordBase(trainingSet):
    wordBase = []
    for rof in trainingSet.keys():
        for news in trainingSet[rof]:
            for word in news.split(' '):
                wordBase.append(word)
    return list(set(wordBase))

In [58]:
def generateData(wordBase, targetSet):
    xy = np.zeros([len(wordBase)+1, (len(targetSet['real'])+len(targetSet['fake']))])
    n = 0
    for rof in ['real', 'fake']:
        for news in targetSet[rof]:
            for word in news.split(' '):
                xy[wordBase.index(word),n] = 1
            if rof == 'real':
                xy[-1,n] = 1
            n += 1
    return xy

In [23]:
def f(x,t,W):
    o = np.matmul(W.T,x)
    y = 1/(1+np.exp(-o))
    return sum(-t*np.log(y) - (1-t)*np.log(1-y))

def df(x,t,W):
    o = np.matmul(W.T,x)
    y = 1/(1+np.exp(-o))
    lossGradient = y-t
    return np.matmul(x, lossGradient.T)

In [None]:
def sigmoidForward(y):
    return 1/(1+exp(-y))

def fcForward(x,W):
    return np.matmul(W.T,x)
    
def forward(x, W):
    L1 = fcForward(x,W)
    output = sigmoidForward(L1)
    return output

def sigmoidBackward(y,output):
    return output-y

def fcBackward(dy,x):
    dW = np.matmul(x,dy.T)
    return dW

def backward(y,output,x):
    dy = sigmoidBackward(y,output)
    dW = fcBackward(dy,x)
    return dW

def NLL(y, output):
    return -sum(y*log(output)+(1-y)*log((1-output)))

def backwardReg(y,output,x, W, lam):
    dy = sigmoidBackward(y,output)
    dW = fcBackward(dy,x)
    dWR = 2*lam*W
    return dW + dWR

def NLLReg(y,output, lam, W):
    return -sum(y*log(output)+(1-y)*log((1-output)))+ sum(lam*W)

In [51]:
def grad_descent(f, df, x, y, init_t, alpha=0.001,max_iter=10000, EPS = 1e-5):
    
    prev_t = init_t-10*EPS
    t = init_t.copy()
    iter  = 0

    while norm(t - prev_t) >  EPS and iter < max_iter:
        
        #normal gradient descent
        prev_t = t.copy()
        t -= alpha*df(x,y,t)
        
        if iter % 5 == 0:
            print "Iter", iter
            print f(x,y,t)
        
        iter += 1
    
    return t

In [27]:
def performance(x, y, W, print_output = True,report = "test set"):
    output = forward(x,W)
    result = 0.
    print x.shape
    for i in range(x.shape[1]):
        if (output[0,i]>0.5 and y[i]==1):
            result+=1
        elif (output[0,i]<0.5 and y[i]==0):
            result += 1
            
    if print_output:
        print "Performance on "+report+ ": " +str(result)+"/"+str(x.shape[1])+"\n"
    
    return result/x.shape[1]

In [61]:
np.random.seed(1)
fakeNews, realNews = getNews()
trainingSet, validationSet, testSet = dataSetSplit(fakeNews, realNews)
wordBase = generateWordBase(trainingSet)

training_xy = generateData(wordBase, trainingSet)

tpp not dead after trump win advocate says


ValueError: ' ' is not in list

In [52]:
W = np.random.normal(0.,0.,[len(wordBase),1])
trainedW = grad_descent(f, df, training_xy[:-1,:], training_xy[-1,:], W,alpha=0.0001)

Iter 0
1546.2573510834595
Iter 5
1522.2373263735537
Iter 10
1505.1649621736635
Iter 15
1490.2199621434852
Iter 20
1477.1136966702807
Iter 25
1465.5964536004285
Iter 30
1455.452931155075
Iter 35
1446.4979785998353
Iter 40
1438.5726196740925
Iter 45
1431.540448254552
Iter 50
1425.2844283624247
Iter 55
1419.7040948070824
Iter 60
1414.7131301272461
Iter 65
1410.2372830686732
Iter 70
1406.2125899507832
Iter 75
1402.583860345404
Iter 80
1399.3033907840493
Iter 85
1396.329873603291
Iter 90
1393.6274718337672
Iter 95
1391.165034825699
Iter 100
1388.9154328567404
Iter 105
1386.854992177818
Iter 110
1384.963014781562
Iter 115
1383.2213696293475
Iter 120
1381.6141441721422
Iter 125
1380.1273467828687
Iter 130
1378.748652222661
Iter 135
1377.4671835283098
Iter 140
1376.2733247685696
Iter 145
1375.1585600044136
Iter 150
1374.115334530244
Iter 155
1373.1369350931536
Iter 160
1372.2173863056685
Iter 165
1371.3513609009137
Iter 170
1370.534101842011
Iter 175
1369.7613546015812
Iter 180
1369.0293081823

Iter 1485
1335.22367397471
Iter 1490
1335.1855479761332
Iter 1495
1335.1476019477009
Iter 1500
1335.1098346457577
Iter 1505
1335.0722448389818
Iter 1510
1335.0348313082138
Iter 1515
1334.9975928462907
Iter 1520
1334.9605282578816
Iter 1525
1334.9236363593263
Iter 1530
1334.8869159784776
Iter 1535
1334.8503659545454
Iter 1540
1334.8139851379437
Iter 1545
1334.777772390142
Iter 1550
1334.7417265835156
Iter 1555
1334.7058466012027
Iter 1560
1334.6701313369606
Iter 1565
1334.6345796950266
Iter 1570
1334.5991905899805
Iter 1575
1334.5639629466082
Iter 1580
1334.5288956997704
Iter 1585
1334.493987794271
Iter 1590
1334.4592381847287
Iter 1595
1334.424645835451
Iter 1600
1334.39020972031
Iter 1605
1334.35592882262
Iter 1610
1334.321802135018
Iter 1615
1334.2878286593455
Iter 1620
1334.2540074065314
Iter 1625
1334.2203373964796
Iter 1630
1334.186817657955
Iter 1635
1334.1534472284745
Iter 1640
1334.1202251541965
Iter 1645
1334.0871504898164
Iter 1650
1334.0542222984584
Iter 1655
1334.0214396515

Iter 2920
1328.7710957103213
Iter 2925
1328.758172858853
Iter 2930
1328.74528995754
Iter 2935
1328.732446838168
Iter 2940
1328.7196433334175
Iter 2945
1328.7068792768584
Iter 2950
1328.694154502944
Iter 2955
1328.6814688470063
Iter 2960
1328.6688221452487
Iter 2965
1328.6562142347416
Iter 2970
1328.6436449534156
Iter 2975
1328.6311141400572
Iter 2980
1328.6186216343026
Iter 2985
1328.6061672766318
Iter 2990
1328.5937509083642
Iter 2995
1328.581372371652
Iter 3000
1328.5690315094755
Iter 3005
1328.556728165638
Iter 3010
1328.5444621847605
Iter 3015
1328.5322334122752
Iter 3020
1328.5200416944217
Iter 3025
1328.5078868782418
Iter 3030
1328.4957688115737
Iter 3035
1328.4836873430468
Iter 3040
1328.4716423220775
Iter 3045
1328.459633598864
Iter 3050
1328.4476610243805
Iter 3055
1328.4357244503733
Iter 3060
1328.4238237293548
Iter 3065
1328.4119587146006
Iter 3070
1328.4001292601422
Iter 3075
1328.3883352207638
Iter 3080
1328.3765764519976
Iter 3085
1328.364852810118
Iter 3090
1328.35316415

Iter 4355
1326.226924278219
Iter 4360
1326.2209371644424
Iter 4365
1326.2149639799104
Iter 4370
1326.2090046799376
Iter 4375
1326.203059220019
Iter 4380
1326.1971275558294
Iter 4385
1326.1912096432225
Iter 4390
1326.18530543823
Iter 4395
1326.1794148970603
Iter 4400
1326.1735379760978
Iter 4405
1326.167674631903
Iter 4410
1326.1618248212108
Iter 4415
1326.155988500929
Iter 4420
1326.150165628139
Iter 4425
1326.144356160094
Iter 4430
1326.1385600542183
Iter 4435
1326.1327772681068
Iter 4440
1326.1270077595236
Iter 4445
1326.121251486402
Iter 4450
1326.1155084068428
Iter 4455
1326.109778479114
Iter 4460
1326.1040616616503
Iter 4465
1326.0983579130516
Iter 4470
1326.0926671920827
Iter 4475
1326.0869894576722
Iter 4480
1326.0813246689122
Iter 4485
1326.075672785057
Iter 4490
1326.0700337655223
Iter 4495
1326.0644075698854
Iter 4500
1326.0587941578829
Iter 4505
1326.0531934894111
Iter 4510
1326.0476055245254
Iter 4515
1326.042030223438
Iter 4520
1326.0364675465191
Iter 4525
1326.03091745429

Iter 5795
1324.941940166521
Iter 5800
1324.9386441550582
Iter 5805
1324.935354230459
Iter 5810
1324.9320703770732
Iter 5815
1324.9287925793026
Iter 5820
1324.9255208215986
Iter 5825
1324.9222550884654
Iter 5830
1324.9189953644566
Iter 5835
1324.9157416341764
Iter 5840
1324.9124938822795
Iter 5845
1324.9092520934712
Iter 5850
1324.9060162525059
Iter 5855
1324.9027863441881
Iter 5860
1324.899562353372
Iter 5865
1324.8963442649606
Iter 5870
1324.8931320639065
Iter 5875
1324.889925735211
Iter 5880
1324.886725263924
Iter 5885
1324.8835306351446
Iter 5890
1324.8803418340194
Iter 5895
1324.8771588457435
Iter 5900
1324.8739816555606
Iter 5905
1324.870810248761
Iter 5910
1324.8676446106838
Iter 5915
1324.8644847267146
Iter 5920
1324.8613305822869
Iter 5925
1324.858182162881
Iter 5930
1324.8550394540237
Iter 5935
1324.851902441289
Iter 5940
1324.8487711102975
Iter 5945
1324.8456454467157
Iter 5950
1324.8425254362564
Iter 5955
1324.8394110646784
Iter 5960
1324.8363023177865
Iter 5965
1324.8331991

Iter 7230
1324.1963715474035
Iter 7235
1324.1943357972216
Iter 7240
1324.1923031559063
Iter 7245
1324.190273616839
Iter 7250
1324.188247173418
Iter 7255
1324.1862238190593
Iter 7260
1324.184203547198
Iter 7265
1324.1821863512866
Iter 7270
1324.1801722247956
Iter 7275
1324.1781611612128
Iter 7280
1324.1761531540446
Iter 7285
1324.1741481968143
Iter 7290
1324.172146283063
Iter 7295
1324.1701474063502
Iter 7300
1324.168151560251
Iter 7305
1324.1661587383605
Iter 7310
1324.1641689342891
Iter 7315
1324.1621821416652
Iter 7320
1324.160198354135
Iter 7325
1324.1582175653612
Iter 7330
1324.156239769024
Iter 7335
1324.154264958821
Iter 7340
1324.1522931284658
Iter 7345
1324.1503242716901
Iter 7350
1324.1483583822426
Iter 7355
1324.1463954538876
Iter 7360
1324.1444354804075
Iter 7365
1324.1424784556011
Iter 7370
1324.1405243732838
Iter 7375
1324.1385732272877
Iter 7380
1324.136625011461
Iter 7385
1324.13467971967
Iter 7390
1324.1327373457957
Iter 7395
1324.1307978837367
Iter 7400
1324.1288613274

Iter 8680
1323.7140373948023
Iter 8685
1323.71268444879
Iter 8690
1323.711333261993
Iter 8695
1323.7099838312276
Iter 8700
1323.708636153317
Iter 8705
1323.707290225092
Iter 8710
1323.7059460433911
Iter 8715
1323.70460360506
Iter 8720
1323.703262906951
Iter 8725
1323.701923945925
Iter 8730
1323.700586718849
Iter 8735
1323.6992512225981
Iter 8740
1323.697917454054
Iter 8745
1323.6965854101063
Iter 8750
1323.6952550876508
Iter 8755
1323.6939264835914
Iter 8760
1323.6925995948388
Iter 8765
1323.691274418311
Iter 8770
1323.6899509509324
Iter 8775
1323.6886291896358
Iter 8780
1323.6873091313598
Iter 8785
1323.685990773051
Iter 8790
1323.6846741116624
Iter 8795
1323.683359144154
Iter 8800
1323.6820458674936
Iter 8805
1323.6807342786549
Iter 8810
1323.6794243746194
Iter 8815
1323.678116152375
Iter 8820
1323.676809608917
Iter 8825
1323.675504741247
Iter 8830
1323.674201546374
Iter 8835
1323.6729000213136
Iter 8840
1323.671600163088
Iter 8845
1323.6703019687268
Iter 8850
1323.6690054352662
Iter

In [53]:
performance(training_xy[:-1,:], training_xy[-1,:], trainedW, print_output = True,report = "training set")

(4855L, 2285L)
Performance on training set: 1598.0/2285



0.6993435448577681

In [55]:
wordBase

['pardon',
 'hats',
 'child',
 'pide',
 'colleges',
 'everybody',
 'manaforts',
 'obstruction',
 'protest',
 'controversial',
 'hanging',
 'protestors',
 'liar',
 'battleground',
 'hate',
 'assembled',
 'marching',
 'stinks',
 'looking',
 'votes',
 'voter',
 'pointing',
 'dazu',
 'paris',
 'tweet',
 'tonya',
 'amtsantritt',
 'investigation',
 'voted',
 'under',
 'teaching',
 'sorry',
 'sway',
 'forbidden',
 'divergent',
 'evangelical',
 'updated',
 'risk',
 'commerce',
 'smacks',
 'regional',
 'advisors',
 'replaces',
 'every',
 'govern',
 'affect',
 'exporters',
 'disneys',
 'improve',
 'trumpisms',
 'tickets',
 'school',
 'snowflakes',
 'debunks',
 'skills',
 'supports',
 'companies',
 'solution',
 'piling',
 'roadblock',
 'pardons',
 'sacks',
 'el',
 'warmbiers',
 'heading',
 'triumph',
 'clothes',
 'uncovers',
 'bill',
 'force',
 'leaders',
 '10m',
 'miller',
 'warns',
 'aides',
 'bridgitte',
 'budget',
 'chef',
 'messy',
 'likely',
 'street',
 '49',
 'streep',
 'trumpprotest',
 'a