In [2]:
import re
import csv
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
#import nltk
from collections import Counter 

In [3]:
print(th.__version__)

1.3.1


# Lecture des données et construction des train, dev et test sets

In [5]:
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [6]:
##On transforme les 3 colonnes en 1 : url, titre et le texte deviennent un seul élément
def read_data(filename):
    data = []
    labels = []
    with open(filename, encoding="utf8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            txt = clean_str(row[2])
            txt = " ".join(txt.split("\n"))
#             d = clean_str(row[0])+" "+clean_str(row[1])
            labels.append(row[3])
            data.append(txt)    
    data.pop(0)
    labels.pop(0)
    labels = [float(l) for l in labels]
    return np.array(data), np.array(labels)

In [7]:
data, labels = read_data("data.csv")

In [8]:
print(data[0])

image copyright getty images on sunday morning , donald trump went off on a twitter tirade against a member of his own party this , in itself , is n't exactly huge news it 's far from the first time the president has turned his rhetorical cannons on his own ranks this time , however , his attacks were particularly biting and personal he essentially called tennessee senator bob corker , the chair of the powerful senate foreign relations committee , a coward for not running for re election he said mr corker begged for the president 's endorsement , which he refused to give he wrongly claimed that mr corker 's support of the iranian nuclear agreement was his only political accomplishment unlike some of his colleagues , mr corker free from having to worry about his immediate political future did n't hold his tongue skip twitter post by senbobcorker it 's a shame the white house has become an adult day care center someone obviously missed their shift this morning senator bob corker \( senbo

In [9]:
print(data.shape)
fake_data = data[labels==0]
real_data = data[labels==1]
fake_labels = labels[labels==0]
real_labels = labels[labels==1]
print("Nombre de fake news : {}".format(len(fake_data)))
print("Nombre de news correctes : {}".format(len(real_data)))


(4009,)
Nombre de fake news : 2137
Nombre de news correctes : 1872


In [10]:
print(fake_data.shape)

(2137,)


In [11]:
train_data = np.array([*real_data[:int(len(real_data)*0.8)],*fake_data[:int(len(fake_data)*0.8)]])
train_labels = np.array([*real_labels[:int(len(real_labels)*0.8)],*fake_labels[:int(len(fake_labels)*0.8)]], dtype=np.float32)

dev_data = np.array([*real_data[int(len(real_data)*0.8):int(len(real_data)*0.9)], *fake_data[int(len(fake_data)*0.8):int(len(fake_data)*0.9)]])
dev_labels = np.array([*real_labels[int(len(real_labels)*0.8):int(len(real_labels)*0.9)], *fake_labels[int(len(fake_labels)*0.8):int(len(fake_labels)*0.9)]], dtype=np.float32)

test_data = np.array([*real_data[int(len(real_data)*0.9):], *fake_data[int(len(fake_data)*0.9):]])
test_labels = np.array([*real_labels[int(len(real_labels)*0.9):], *fake_labels[int(len(fake_labels)*0.9):]], dtype=np.float32)

In [12]:
print("taille du train set : {}".format(len(train_data)))
print("taille du dev set : {}".format(len(dev_data)))
print("taille du test set : {}".format(len(test_data)))

taille du train set : 3206
taille du dev set : 401
taille du test set : 402


# Construction du vocabulaire

In [13]:
txt = [w for txt in train_data for w in txt.split()]
          
# token = nltk.word_tokenize(txt)
words = Counter(txt)
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words
vocab_size = len(words)
word2idx = {o:i for i,o in enumerate(words)}
word2idx["UNK"]=len(word2idx)
print("vocab size : ",vocab_size)

vocab size :  43464


In [14]:
train_data = [[word2idx[w] for w in news.split()] for news in train_data]
dev_data = [[word2idx[w] if w in word2idx else 0 for w in news.split()] for news in dev_data]
test_data = [[word2idx[w] if w in word2idx else 0 for w in news.split()] for news in test_data]

In [15]:
def pad_input(data, seq_len):
    features = np.zeros((len(data), seq_len), dtype=int)
    for ii, review in enumerate(data):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

# Padding
## on pad les input pour pouvoir créer des datasets pytorch et simplifier l'apprentissage (fonction de train plus claire, batch plus facile, etc)
on peut tester plusieurs taille de padding : input le plus grand, moyen, min, ou arbitraire du genre 200, 500, etc. On peut aussi ainsi réduire le temps de train.

In [16]:
# train_seq_len = max([len(s) for s in train_data])
# dev_seq_len = max([len(s) for s in dev_data])
# test_seq_len = max([len(s) for s in test_data])

# train_data = pad_input(train_data, train_seq_len)
# dev_data = pad_input(dev_data, dev_seq_len)
# test_data = pad_input(test_data, test_seq_len)

seq_len = 200


train_data = pad_input(train_data, seq_len)
dev_data = pad_input(dev_data, seq_len)
test_data = pad_input(test_data, seq_len)

In [17]:
train_data = TensorDataset(th.from_numpy(train_data).type(th.LongTensor), th.from_numpy(train_labels))
dev_data = TensorDataset(th.from_numpy(dev_data).type(th.LongTensor), th.from_numpy(dev_labels))
test_data = TensorDataset(th.from_numpy(test_data).type(th.LongTensor), th.from_numpy(test_labels))

batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# CBOW Classifieur

In [15]:
class L_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(L_CBOW_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.l1 = nn.Linear(embedding_dim, 1)
        
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        inputs = th.sum(inputs, dim=1)
        out = th.sigmoid(self.l1(inputs))
        return out

In [16]:
class C_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(C_CBOW_classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(1, 20, 5, 1)
        self.conv2 = nn.Conv1d(20, 50, 5, 1)
        self.dense1 = nn.Linear(450, 250)
        self.dense2 = nn.Linear(250, 64)
        self.dense3 = nn.Linear(64, 1)

    def forward(self, x):

        x = self.embeddings(x)
        x = th.sum(x, dim=0)
        x = x.view(1, 1,-1)
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2, 2)
        x = x.view(-1, 450)
        x = F.relu(self.dense1(x))
        x = F.relu(self.dense2(x))
        x = self.dense3(x)
        return th.sigmoid(x)

In [60]:
loss_fn = nn.BCELoss()
lr = 1e-2
l_cbow = L_CBOW_classifier(len(word2idx), 50)
train_accuracies = []
train_losses = []
dev_accuracies = []
dev_losses = []

In [17]:
def train_cbow(model, max_epochs=20):
    optim = th.optim.Adam(params=model.parameters(), lr =lr, weight_decay=1e-4)
#     model.train()
    
    for e in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        
        n=0
        for x, labels in train_loader:
            n+=batch_size
            preds = model(x)
            loss = loss_fn(preds, labels)

            train_mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()
            
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    train_accuracy+=1
                
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        ###Dev test
        for x, labels in dev_loader:
            preds = model(x)
            loss = loss_fn(preds, labels)
            dev_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    dev_accuracy+=1
                    
        train_accuracies.append(train_accuracy/len(train_data))
        train_losses.append(train_mean_loss/len(train_data))
        dev_accuracies.append(dev_accuracy/len(dev_data))
        dev_losses.append(dev_mean_loss/len(dev_data))
        
        
        print("EPOCH {}".format(e+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")

In [62]:
train_cbow(l_cbow)

step : 1500/3206 
step accuracy :  0.6273333333333333
step loss :  0.029109583854675294
step : 3000/3206 
step accuracy :  0.6413333333333333
step loss :  0.02749831024805705
EPOCH 1
Train Accuracy :  0.6378665003119152
Dev Accuracy :  0.628428927680798
Train Mean loss :  0.02839727291671172
Dev Mean loss :  0.03711528017039311
----------------------------------------
step : 1500/3206 
step accuracy :  0.712
step loss :  0.018421828746795654
step : 3000/3206 
step accuracy :  0.7523333333333333
step loss :  0.014224151611328124
EPOCH 2
Train Accuracy :  0.7551466001247661
Dev Accuracy :  0.8678304239401496
Train Mean loss :  0.013802091255830514
Dev Mean loss :  0.008222416749321611
----------------------------------------
step : 1500/3206 
step accuracy :  0.8826666666666667
step loss :  0.005967991272608439
step : 3000/3206 
step accuracy :  0.8633333333333333
step loss :  0.005540942251682282
EPOCH 3
Train Accuracy :  0.8686837180286962
Dev Accuracy :  0.8478802992518704
Train Mean 

In [105]:
print(max(dev_accuracies))

0.970074812967581


In [18]:
def test_cbow(model, test_data, test_labels):
    acc = 0
    test_mean_loss = 0
    
    for x, labels in test_loader:
            
            preds = model(x)
            loss = loss_fn(preds, labels)
            test_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    acc+=1
    print("Test accuracy : ", acc/len(test_data))
    print("Test mean loss : ", test_mean_loss/len(test_data))

In [64]:
test_cbow(l_cbow, test_data, test_labels)

Test accuracy :  0.9427860696517413
Test mean loss :  0.002640826340338484


  "Please ensure they have the same size.".format(target.size(), input.size()))


# Un classifieur RNN classique (GRU) 

In [18]:
class GRU_classifier(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embedding_dim, rnn_dropout, bidirectional=False):
        super(GRU_classifier, self).__init__()
        
        self.nb_cells = nb_cells
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout,bidirectional=bidirectional)

        if bidirectional:    
            self.fc = nn.Linear(2*hidden_size, 1)
        else:
            self.fc = nn.Linear(hidden_size, 1)
        self.is_bidirectional = bidirectional 
        self.dropout = nn.Dropout(0.2)
        nn.init.xavier_uniform_(self.fc.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.fc.bias.data)  # Xavier/Glorot init for tanh
        
            
    
    def forward(self, inputs, hidden):
        batch_size = inputs.size(0)
        embeds = self.embedding(inputs)
        rnn_out, hidden = self.gru(embeds, hidden)
        
        out = self.dropout(rnn_out)

        out = self.fc(out)
        out = th.sigmoid(out)
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        #hidden = th.Tensor(self.nb_cells, batch_size, self.hidden_size)
        if self.is_bidirectional:
            hidden = weight.new(2*self.nb_cells, batch_size, self.hidden_size).zero_()
        else:
            hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
        return hidden

# La même classe mais avec de l'attention 

In [72]:
class GRU_with_attention_classifier(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embedding_dim, rnn_dropout, bidirectional=False):
        super(GRU_with_attention_classifier, self).__init__()
        
        self.nb_cells = nb_cells
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout,bidirectional=bidirectional)
        
        if bidirectional:    
            self.fc = nn.Linear(2*hidden_size, 1)
        else:
            self.fc = nn.Linear(hidden_size, 1)
            
        self.is_bidirectional = bidirectional 
        self.dropout = nn.Dropout(0.2)
        nn.init.xavier_uniform_(self.fc.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.fc.bias.data)  # Xavier/Glorot init for tanh
        
        self.fc_attn1 = nn.Linear(hidden_size, 100)
        self.fc_attn2 = nn.Linear(100,1, bias=False)    
        self.fc_attn3 = nn.Linear(hidden_size, 1)
    
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()
        
    def forward(self, inputs, hidden):
        batch_size = inputs.size(0)
        embeds = self.embedding(inputs)
        rnn_out, hidden = self.gru(embeds, hidden)
        
        out = self.dropout(rnn_out)
        attention_weight = self.fc_attn1(out)
        attention_weight = self.tanh(attention_weight)
        attention_weight = self.softmax(self.fc_attn2(attention_weight))
        out = (out*attention_weight).sum(dim=1)
        out = self.fc_attn3(out)
        out = th.sigmoid(out)
        
        
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        #hidden = th.Tensor(self.nb_cells, batch_size, self.hidden_size)
        if self.is_bidirectional:
            hidden = weight.new(2*self.nb_cells, batch_size, self.hidden_size).zero_()
        else:
            hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
        return hidden

In [79]:
##Hyper-paramètres
nb_cells = 1
hidden_size = 32
embedding_dim = 10
learning_rate = 1e-2
loss_fn = nn.BCELoss()
m = GRU_with_attention_classifier(nb_cells, hidden_size, vocab_size, embedding_dim, 0.0, bidirectional=False)
#m = GRU_classifier(nb_cells, hidden_size, vocab_size, embedding_dim, 0.0, bidirectional=False)

print(m)

GRU_with_attention_classifier(
  (embedding): Embedding(43464, 10)
  (gru): GRU(10, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc_attn1): Linear(in_features=32, out_features=100, bias=True)
  (fc_attn2): Linear(in_features=100, out_features=1, bias=False)
  (fc_attn3): Linear(in_features=32, out_features=1, bias=True)
  (softmax): Softmax(dim=1)
  (tanh): Tanh()
)


In [80]:
def train_rnn(model, batch_size,lr, max_epochs=10, conv=False):
    optim = th.optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-4)
    model.train()
    best_dict = model.state_dict()
    best_acc = 0
    for i in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0

        n = 0 

        for x, labels in train_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            
            optim.zero_grad()
            h = h.data
            
            n+=loc_batch_size
            
            preds, _ = model(x, h)  
            loss = loss_fn(preds, labels)
            h = h.detach()
            train_mean_loss+=loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()

            for pred, label in zip(preds, labels):
                if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                    train_accuracy+=1
        
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        for x, labels in dev_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            preds, _ = model(x, h)
            loss = loss_fn(preds, labels)

            dev_mean_loss+=loss.item()
            
            for pred, label in zip(preds, labels):
                if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                    dev_accuracy+=1
        
        if(i==0):
            best_dict=model.state_dict()
            best_acc=dev_accuracy/len(dev_data)
        else:
            if(best_acc<(dev_accuracy/len(dev_data))):
                best_acc=dev_accuracy/len(dev_data)
                best_dict=model.state_dict()
                print("new best acc")
        
        print("EPOCH {}".format(i+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")
        
#     model.load_state_dict(best_dict)
#     acc = test(model, test_data, conv=conv)
#     stat_dict[model_name][4].append(acc)
#     print("Accuracy on test data : ", acc)
#     return best_dict

In [81]:
train_rnn(m, batch_size, learning_rate)

step : 250/3206 
step accuracy :  0.576
step loss :  0.013595621824264527
step : 500/3206 
step accuracy :  0.572
step loss :  0.013630937457084656
step : 750/3206 
step accuracy :  0.5813333333333334
step loss :  0.013452350695927939
step : 1000/3206 
step accuracy :  0.591
step loss :  0.013351022362709046
step : 1250/3206 
step accuracy :  0.6064
step loss :  0.013296894598007203
step : 1500/3206 
step accuracy :  0.6166666666666667
step loss :  0.013116304198900858
step : 1750/3206 
step accuracy :  0.6165714285714285
step loss :  0.01305451910836356
step : 2000/3206 
step accuracy :  0.6205
step loss :  0.012949522465467453
step : 2250/3206 
step accuracy :  0.6248888888888889
step loss :  0.01285066941049364
step : 2500/3206 
step accuracy :  0.6344
step loss :  0.01267199878692627
step : 2750/3206 
step accuracy :  0.6389090909090909
step loss :  0.012607661659067328
step : 3000/3206 
step accuracy :  0.649
step loss :  0.01247554690639178


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


EPOCH 1
Train Accuracy :  0.6512788521522146
Dev Accuracy :  0.7955112219451371
Train Mean loss :  0.012593304476958101
Dev Mean loss :  0.01032596856281347
----------------------------------------
step : 250/3206 
step accuracy :  0.78
step loss :  0.010204473853111268
step : 500/3206 
step accuracy :  0.806
step loss :  0.00952898108959198
step : 750/3206 
step accuracy :  0.8213333333333334
step loss :  0.009120239893595377
step : 1000/3206 
step accuracy :  0.818
step loss :  0.009129592806100845
step : 1250/3206 
step accuracy :  0.808
step loss :  0.009299426054954528
step : 1500/3206 
step accuracy :  0.806
step loss :  0.009323635756969453
step : 1750/3206 
step accuracy :  0.8017142857142857
step loss :  0.009308373042515345
step : 2000/3206 
step accuracy :  0.803
step loss :  0.00916855876147747
step : 2250/3206 
step accuracy :  0.8044444444444444
step loss :  0.008987234036127726
step : 2500/3206 
step accuracy :  0.808
step loss :  0.008861706125736236
step : 2750/3206 
s

EPOCH 8
Train Accuracy :  0.9912663755458515
Dev Accuracy :  0.9650872817955112
Train Mean loss :  0.0006448506819078767
Dev Mean loss :  0.0045491936134680775
----------------------------------------
step : 250/3206 
step accuracy :  0.988
step loss :  0.001235723470337689
step : 500/3206 
step accuracy :  0.984
step loss :  0.0013283347520045936
step : 750/3206 
step accuracy :  0.984
step loss :  0.0011115919385726253
step : 1000/3206 
step accuracy :  0.986
step loss :  0.000930727364262566
step : 1250/3206 
step accuracy :  0.9872
step loss :  0.0008918981378898025
step : 1500/3206 
step accuracy :  0.9886666666666667
step loss :  0.0007858748990111054
step : 1750/3206 
step accuracy :  0.9885714285714285
step loss :  0.0008095772614968675
step : 2000/3206 
step accuracy :  0.9895
step loss :  0.0007533612898550928
step : 2250/3206 
step accuracy :  0.9897777777777778
step loss :  0.0007497105799201462
step : 2500/3206 
step accuracy :  0.9904
step loss :  0.0007440367992967368
st

In [82]:
def test_rnn(model, test_data, test_labels):
    acc = 0
    test_mean_loss = 0
    
    for x, labels in test_loader:
            loc_batch_size = x.size(0)
            h = model.init_hidden(loc_batch_size)
            preds, _ = model(x, h)
            loss = loss_fn(preds, labels)
            test_mean_loss+=loss.item()
            for pred, label in zip(preds, labels):
                if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                    acc+=1
    print("Test accuracy : ", acc/len(test_data))
    print("Test mean loss : ", test_mean_loss/len(test_data))

In [83]:
test_rnn(m, test_data, test_labels)

Test accuracy :  0.9651741293532339
Test mean loss :  0.003409405364029443


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
