In [1]:
import re
import csv
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import nltk
from collections import Counter 

# Lecture des données et construction des train, dev et test sets

In [2]:
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [3]:
##On transforme les 3 colonnes en 1 : url, titre et le texte deviennent un seul élément
def read_data(filename):
    data = []
    labels = []
    with open(filename, encoding="utf8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            txt = clean_str(row[2])
            txt = " ".join(txt.split("\n"))
#             d = clean_str(row[0])+" "+clean_str(row[1])
            labels.append(row[3])
            data.append(txt)    
    data.pop(0)
    labels.pop(0)
    labels = [float(l) for l in labels]
    return np.array(data), np.array(labels)

In [4]:
data, labels = read_data("data.csv")

In [5]:
print(data[0])

image copyright getty images on sunday morning , donald trump went off on a twitter tirade against a member of his own party this , in itself , is n't exactly huge news it 's far from the first time the president has turned his rhetorical cannons on his own ranks this time , however , his attacks were particularly biting and personal he essentially called tennessee senator bob corker , the chair of the powerful senate foreign relations committee , a coward for not running for re election he said mr corker begged for the president 's endorsement , which he refused to give he wrongly claimed that mr corker 's support of the iranian nuclear agreement was his only political accomplishment unlike some of his colleagues , mr corker free from having to worry about his immediate political future did n't hold his tongue skip twitter post by senbobcorker it 's a shame the white house has become an adult day care center someone obviously missed their shift this morning senator bob corker \( senbo

In [6]:
print(data.shape)
fake_data = data[labels==0]
real_data = data[labels==1]
fake_labels = labels[labels==0]
real_labels = labels[labels==1]
print("Nombre de fake news : {}".format(len(fake_data)))
print("Nombre de news correctes : {}".format(len(real_data)))


(4009,)
Nombre de fake news : 2137
Nombre de news correctes : 1872


In [7]:
print(fake_data.shape)

(2137,)


In [75]:
train_data = np.array([*real_data[:int(len(real_data)*0.8)],*fake_data[:int(len(fake_data)*0.8)]])
train_labels = np.array([*real_labels[:int(len(real_labels)*0.8)],*fake_labels[:int(len(fake_labels)*0.8)]], dtype=np.float32)

dev_data = np.array([*real_data[int(len(real_data)*0.8):int(len(real_data)*0.9)], *fake_data[int(len(fake_data)*0.8):int(len(fake_data)*0.9)]])
dev_labels = np.array([*real_labels[int(len(real_labels)*0.8):int(len(real_labels)*0.9)], *fake_labels[int(len(fake_labels)*0.8):int(len(fake_labels)*0.9)]], dtype=np.float32)

test_data = np.array([*real_data[int(len(real_data)*0.9):], *fake_data[int(len(fake_data)*0.9):]])
test_labels = np.array([*real_labels[int(len(real_labels)*0.9):], *fake_labels[int(len(fake_labels)*0.9):]], dtype=np.float32)

In [76]:
print("taille du train set : {}".format(len(train_data)))
print("taille du dev set : {}".format(len(dev_data)))
print("taille du test set : {}".format(len(test_data)))

taille du train set : 3206
taille du dev set : 401
taille du test set : 402


# Construction du vocabulaire

In [77]:
txt = [w for txt in train_data for w in txt.split()]
          
# token = nltk.word_tokenize(txt)
words = Counter(txt)
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words
vocab_size = len(words)
word2idx = {o:i for i,o in enumerate(words)}
word2idx["UNK"]=len(word2idx)
print("vocab size : ",vocab_size)

vocab size :  43464


In [78]:
train_data = [[word2idx[w] for w in news.split()] for news in train_data]
dev_data = [[word2idx[w] if w in word2idx else 0 for w in news.split()] for news in dev_data]
test_data = [[word2idx[w] if w in word2idx else 0 for w in news.split()] for news in test_data]

In [79]:
def pad_input(data, seq_len):
    features = np.zeros((len(data), seq_len), dtype=int)
    for ii, review in enumerate(data):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

# Padding
## on pad les input pour pouvoir créer des datasets pytorch et simplifier l'apprentissage (fonction de train plus claire, batch plus facile, etc)
### on peut tester plusieurs taille de padding : input le plus grand, moyen, min, ou arbitraire du genre 200, 500, etc. On peut aussi ainsi réduire le temps de train.

In [80]:
# train_seq_len = max([len(s) for s in train_data])
# dev_seq_len = max([len(s) for s in dev_data])
# test_seq_len = max([len(s) for s in test_data])

# train_data = pad_input(train_data, train_seq_len)
# dev_data = pad_input(dev_data, dev_seq_len)
# test_data = pad_input(test_data, test_seq_len)

seq_len = 200


train_data = pad_input(train_data, seq_len)
dev_data = pad_input(dev_data, seq_len)
test_data = pad_input(test_data, seq_len)

In [81]:
np.mean([len(data) for data in test_data])

200.0

In [82]:
train_data = TensorDataset(th.from_numpy(train_data).type(th.LongTensor), th.from_numpy(train_labels))
dev_data = TensorDataset(th.from_numpy(dev_data).type(th.LongTensor), th.from_numpy(dev_labels))
test_data = TensorDataset(th.from_numpy(test_data).type(th.LongTensor), th.from_numpy(test_labels))

batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [83]:
train_data[0]

(tensor([  170,   473,   953,   448,    11,   231,   800,     3,   389,    71,
           340,   141,    11,     7,   213, 13668,    98,     7,   781,     6,
            28,   219,   331,    22,     3,     8,   964,     3,    10,   138,
          1349,  1029,    83,    13,    30,   330,    31,     2,    67,    79,
             2,   100,    29,   870,    28, 16815, 23000,    11,    28,   219,
          4326,    22,    79,     3,   233,     3,    28,   918,    57,  1475,
          9716,     5,   843,    15,  2621,   221,  1564,  1476,  1913,  1076,
             3,     2,  4012,     6,     2,   844,   886,   405,  1467,   576,
             3,     7, 13669,    12,    36,   323,    12,   192,   449,    15,
            24,    93,  1076, 16816,    12,     2,   100,    30,  6528,     3,
            52,    15,  2138,     4,   381,    15, 11681,  1661,     9,    93,
          1076,    30,   314,     6,     2,  2316,   430,   772,    17,    28,
            96,   310, 19421,  2704,    77,     6,  

# CBOW Classifieur

In [91]:
class L_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(L_CBOW_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.l1 = nn.Linear(embedding_dim, 1)
        
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        inputs = th.sum(inputs, dim=1)
        out = th.sigmoid(self.l1(inputs))
        return out

In [279]:
class C_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(C_CBOW_classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(1, 20, 5, 1)
        self.conv2 = nn.Conv1d(20, 50, 5, 1)
        self.dense1 = nn.Linear(450, 250)
        self.dense2 = nn.Linear(250, 64)
        self.dense3 = nn.Linear(64, 1)

    def forward(self, x):

        x = self.embeddings(x)
        x = th.sum(x, dim=0)
        x = x.view(1, 1,-1)
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2, 2)
        x = x.view(-1, 450)
        x = F.relu(self.dense1(x))
        x = F.relu(self.dense2(x))
        x = self.dense3(x)
        return th.sigmoid(x)

In [101]:
loss_fn = nn.BCELoss()
lr = 1e-2
l_cbow = L_CBOW_classifier(len(word2idx), 50)
train_accuracies = []
train_losses = []
dev_accuracies = []
dev_losses = []

In [102]:
def train_cbow(model, max_epochs=20):
    optim = th.optim.Adam(params=model.parameters(), lr =lr, weight_decay=1e-4)
#     model.train()
    
    for e in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        
        n=0
        for x, labels in train_loader:
            n+=batch_size
            preds = model(x)
            loss = loss_fn(preds, labels)

            train_mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()
            
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    train_accuracy+=1
                
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        ###Dev test
        for x, labels in dev_loader:
            preds = model(x)
            loss = loss_fn(preds, labels)
            dev_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    dev_accuracy+=1
                    
        train_accuracies.append(train_accuracy/len(train_data))
        train_losses.append(train_mean_loss/len(train_data))
        dev_accuracies.append(dev_accuracy/len(dev_data))
        dev_losses.append(dev_mean_loss/len(dev_data))
        
        
        print("EPOCH {}".format(e+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")

In [103]:
train_cbow(l_cbow)

step : 250/3206 
step accuracy :  0.548
step loss :  0.20660722541809082
step : 500/3206 
step accuracy :  0.506
step loss :  0.2283901300430298
step : 750/3206 
step accuracy :  0.52
step loss :  0.21716612434387206
step : 1000/3206 
step accuracy :  0.53
step loss :  0.20853510999679564
step : 1250/3206 
step accuracy :  0.5568
step loss :  0.19285156593322753
step : 1500/3206 
step accuracy :  0.5873333333333334
step loss :  0.1816171559492747
step : 1750/3206 
step accuracy :  0.6182857142857143
step loss :  0.16733430515016828
step : 2000/3206 
step accuracy :  0.646
step loss :  0.1544108639359474
step : 2250/3206 
step accuracy :  0.6648888888888889
step loss :  0.14597464587953354
step : 2500/3206 
step accuracy :  0.6792
step loss :  0.14017971453666686
step : 2750/3206 
step accuracy :  0.6912727272727273
step loss :  0.13515727559002963
step : 3000/3206 
step accuracy :  0.703
step loss :  0.1286831189393997
step : 3250/3206 
step accuracy :  0.7052307692307692
step loss :  

step : 2750/3206 
step accuracy :  0.9305454545454546
step loss :  0.010173204259912546
step : 3000/3206 
step accuracy :  0.9346666666666666
step loss :  0.009508234262667732
step : 3250/3206 
step accuracy :  0.9236923076923077
step loss :  0.008931037207952333
EPOCH 7
Train Accuracy :  0.9363693075483468
Dev Accuracy :  0.9650872817955112
Train Mean loss :  0.009053609147175633
Dev Mean loss :  0.006722585625593502
----------------------------------------
step : 250/3206 
step accuracy :  0.96
step loss :  0.00451282175257802
step : 500/3206 
step accuracy :  0.974
step loss :  0.0026117917404098988
step : 750/3206 
step accuracy :  0.9786666666666667
step loss :  0.001971786082881105
step : 1000/3206 
step accuracy :  0.982
step loss :  0.002237326961690087
step : 1250/3206 
step accuracy :  0.9808
step loss :  0.0024277603169750362
step : 1500/3206 
step accuracy :  0.9786666666666667
step loss :  0.002610813499992825
step : 1750/3206 
step accuracy :  0.968
step loss :  0.0033961

step : 1250/3206 
step accuracy :  0.9416
step loss :  0.007750582356614177
step : 1500/3206 
step accuracy :  0.9486666666666667
step loss :  0.006632874651620417
step : 1750/3206 
step accuracy :  0.9508571428571428
step loss :  0.006386711270650267
step : 2000/3206 
step accuracy :  0.95
step loss :  0.006761917960802748
step : 2250/3206 
step accuracy :  0.948
step loss :  0.007120151124870366
step : 2500/3206 
step accuracy :  0.9472
step loss :  0.006927837281652319
step : 2750/3206 
step accuracy :  0.9487272727272728
step loss :  0.006499538244024734
step : 3000/3206 
step accuracy :  0.9473333333333334
step loss :  0.006744286443282666
step : 3250/3206 
step accuracy :  0.9366153846153846
step loss :  0.006363823967989167
EPOCH 14
Train Accuracy :  0.9494697442295695
Dev Accuracy :  0.8703241895261845
Train Mean loss :  0.00645116278726288
Dev Mean loss :  0.015027890590360932
----------------------------------------
step : 250/3206 
step accuracy :  0.952
step loss :  0.00311

In [105]:
print(max(dev_accuracies))

0.970074812967581


In [178]:
def test_cbow(model, test_data, test_labels):
    acc = 0
    test_mean_loss = 0
    
    for x, labels in test_loader:
            
            preds = model(x)
            loss = loss_fn(preds, labels)
            test_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    acc+=1
    print("Test accuracy : ", acc/len(test_data))
    print("Test mean loss : ", test_mean_loss/len(test_data))

In [179]:
test_cbow(l_cbow, test_data, test_labels)

Test accuracy :  0.9303482587064676
Test mean loss :  0.010245549515705204


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


# Un classifieur RNN classique (LSTM ou GRU) 

In [161]:
class RNN_classifier(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embedding_dim, rnn_dropout, is_lstm=False):
        super(RNN_classifier, self).__init__()
        
        self.nb_cells = nb_cells
        self.hidden_size = hidden_size
        self.is_lstm = is_lstm
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if is_lstm:
            self.rnn = nn.LSTM(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout)
        else:
            self.rnn = nn.GRU(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout)
            
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.2)
        nn.init.xavier_uniform_(self.fc.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.fc.bias.data)  # Xavier/Glorot init for tanh
        
    
    def forward(self, inputs, hidden):
        batch_size = inputs.size(0)
        embeds = self.embedding(inputs)
        rnn_out, hidden = self.rnn(embeds, hidden)
        
        out = self.dropout(rnn_out)
        out = self.fc(out)
        out = th.sigmoid(out)
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = th.Tensor(self.nb_cells, batch_size, self.hidden_size)
        if self.is_lstm:
            hidden = (weight.new(self.nb_cells, batch_size, self.hidden_size).zero_(),
                      weight.new(self.nb_cells, batch_size, self.hidden_size).zero_())
        else:
            hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
            
        return hidden
    

In [175]:
##Hyper-paramètres
nb_cells = 1
hidden_size = 32
embedding_dim = 10
learning_rate = 1e-2
fn = nn.BCELoss()
m = RNN_classifier(nb_cells, hidden_size, vocab_size, embedding_dim, 0.0, is_lstm=False)
print(m)

RNN_classifier(
  (embedding): Embedding(43464, 10)
  (rnn): GRU(10, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [176]:
def train_rnn(model, batch_size,lr, max_epochs=10, conv=False):
    optim = th.optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-4)
    model.train()
    best_dict = model.state_dict()
    best_acc = 0
    for i in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0

        n = 0 

        for x, labels in train_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            
            optim.zero_grad()
            h = h.data
            
            n+=loc_batch_size
            
            preds, _ = model(x, h)  
            loss = fn(preds, labels)
            h = h.detach()
            train_mean_loss+=loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()

            for pred, label in zip(preds, labels):
                if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                    train_accuracy+=1
        
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        for x, labels in dev_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            preds, _ = model(x, h)
            loss = fn(preds, labels)

            dev_mean_loss+=loss.item()
            
            for pred, label in zip(preds, labels):
                if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                    dev_accuracy+=1
        
        if(i==0):
            best_dict=model.state_dict()
            best_acc=dev_accuracy/len(dev_data)
        else:
            if(best_acc<(dev_accuracy/len(dev_data))):
                best_acc=dev_accuracy/len(dev_data)
                best_dict=model.state_dict()
                print("new best acc")
        
        print("EPOCH {}".format(i+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")
        
#     model.load_state_dict(best_dict)
#     acc = test(model, test_data, conv=conv)
#     stat_dict[model_name][4].append(acc)
#     print("Accuracy on test data : ", acc)
#     return best_dict

In [181]:
def test_rnn(model, test_data, test_labels):
    acc = 0
    test_mean_loss = 0
    
    for x, labels in test_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            preds, _ = model(x, h)
            loss = loss_fn(preds, labels)
            test_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    acc+=1
    print("Test accuracy : ", acc/len(test_data))
    print("Test mean loss : ", test_mean_loss/len(test_data))

In [177]:
train_rnn(m, batch_size, learning_rate)

step : 250/3206 
step accuracy :  0.532
step loss :  0.013646665334701538
step : 500/3206 
step accuracy :  0.5
step loss :  0.013993316888809204
step : 750/3206 
step accuracy :  0.5373333333333333
step loss :  0.013630117575327555
step : 1000/3206 
step accuracy :  0.569
step loss :  0.013293004035949707
step : 1250/3206 
step accuracy :  0.5744
step loss :  0.013205519247055054
step : 1500/3206 
step accuracy :  0.5806666666666667
step loss :  0.013121230999628702
step : 1750/3206 
step accuracy :  0.5851428571428572
step loss :  0.01304795581953866
step : 2000/3206 
step accuracy :  0.6
step loss :  0.01291685077548027
step : 2250/3206 
step accuracy :  0.6053333333333333
step loss :  0.012796302053663465
step : 2500/3206 
step accuracy :  0.61
step loss :  0.012681925320625306
step : 2750/3206 
step accuracy :  0.6134545454545455
step loss :  0.012617540034380826
step : 3000/3206 
step accuracy :  0.624
step loss :  0.012419666796922683
EPOCH 1
Train Accuracy :  0.632252027448534


step : 500/3206 
step accuracy :  0.994
step loss :  0.00015721046202816068
step : 750/3206 
step accuracy :  0.9946666666666667
step loss :  0.00013710196921601893
step : 1000/3206 
step accuracy :  0.996
step loss :  0.00012514296313747764
step : 1250/3206 
step accuracy :  0.9968
step loss :  0.00011659997222013772
step : 1500/3206 
step accuracy :  0.9973333333333333
step loss :  0.00010092151479329914
step : 1750/3206 
step accuracy :  0.9977142857142857
step loss :  9.553280765456812e-05
step : 2000/3206 
step accuracy :  0.9975
step loss :  0.0001297775499697309
step : 2250/3206 
step accuracy :  0.9946666666666667
step loss :  0.00032492826317643956
step : 2500/3206 
step accuracy :  0.994
step loss :  0.00035029116829391567
step : 2750/3206 
step accuracy :  0.9927272727272727
step loss :  0.0004027137356513942
step : 3000/3206 
step accuracy :  0.9933333333333333
step loss :  0.0003857906954168963
new best acc
EPOCH 8
Train Accuracy :  0.9931378665003119
Dev Accuracy :  0.985

In [180]:
test_rnn(m, test_data, test_labels)

NameError: name 'test_rnn' is not defined