In [1]:
import re
import csv
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import nltk
from collections import Counter 

# Lecture des données et construction des train, dev et test sets

In [2]:
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [28]:
##On transforme les 3 colonnes en 1 : url, titre et le texte deviennent un seul élément
def read_data(filename):
    data = []
    labels = []
    with open(filename, encoding="utf8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            txt = clean_str(row[2])
            txt = " ".join(txt.split("\n"))
#             d = clean_str(row[0])+" "+clean_str(row[1])
            labels.append(row[3])
            data.append(txt)    
    data.pop(0)
    labels.pop(0)
    labels = [float(l) for l in labels]
    return np.array(data), np.array(labels)

In [29]:
data, labels = read_data("data.csv")

In [30]:
print(data[0])

image copyright getty images on sunday morning , donald trump went off on a twitter tirade against a member of his own party this , in itself , is n't exactly huge news it 's far from the first time the president has turned his rhetorical cannons on his own ranks this time , however , his attacks were particularly biting and personal he essentially called tennessee senator bob corker , the chair of the powerful senate foreign relations committee , a coward for not running for re election he said mr corker begged for the president 's endorsement , which he refused to give he wrongly claimed that mr corker 's support of the iranian nuclear agreement was his only political accomplishment unlike some of his colleagues , mr corker free from having to worry about his immediate political future did n't hold his tongue skip twitter post by senbobcorker it 's a shame the white house has become an adult day care center someone obviously missed their shift this morning senator bob corker \( senbo

In [31]:
print(data.shape)
fake_data = data[labels==0]
real_data = data[labels==1]
fake_labels = labels[labels==0]
real_labels = labels[labels==1]
print("Nombre de fake news : {}".format(len(fake_data)))
print("Nombre de news correctes : {}".format(len(real_data)))


(4009,)
Nombre de fake news : 2137
Nombre de news correctes : 1872


In [32]:
print(fake_data.shape)

(2137,)


In [57]:
train_data = np.array([*real_data[:int(len(real_data)*0.8)],*fake_data[:int(len(fake_data)*0.8)]])
train_labels = np.array([*real_labels[:int(len(real_labels)*0.8)],*fake_labels[:int(len(fake_labels)*0.8)]], dtype=np.float32)

dev_data = np.array([*real_data[int(len(real_data)*0.8):int(len(real_data)*0.9)], *fake_data[int(len(fake_data)*0.8):int(len(fake_data)*0.9)]])
dev_labels = np.array([*real_labels[int(len(real_labels)*0.8):int(len(real_labels)*0.9)], *fake_labels[int(len(fake_labels)*0.8):int(len(fake_labels)*0.9)]], dtype=np.float32)

test_data = np.array([*real_data[int(len(real_data)*0.9):], *fake_data[int(len(fake_data)*0.9):]])
test_labels = np.array([*real_labels[int(len(real_labels)*0.9):], *fake_labels[int(len(fake_labels)*0.9):]], dtype=np.float32)

In [34]:
print("taille du train set : {}".format(len(train_data)))
print("taille du dev set : {}".format(len(dev_data)))
print("taille du test set : {}".format(len(test_data)))

taille du train set : 3206
taille du dev set : 401
taille du test set : 402


['i', 'm', 'a', 'g', 'e', ' ', 'c', 'o', 'p', 'y', 'r', 'i', 'g', 'h', 't', ' ', 'g', 'e', 't', 't', 'y', ' ', 'i', 'm', 'a', 'g', 'e', 's', ' ', 'o', 'n', ' ', 's', 'u', 'n', 'd', 'a', 'y', ' ', 'm', 'o', 'r', 'n', 'i', 'n', 'g', ' ', ',', ' ', 'd', 'o', 'n', 'a', 'l', 'd', ' ', 't', 'r', 'u', 'm', 'p', ' ', 'w', 'e', 'n', 't', ' ', 'o', 'f', 'f', ' ', 'o', 'n', ' ', 'a', ' ', 't', 'w', 'i', 't', 't', 'e', 'r', ' ', 't', 'i', 'r', 'a', 'd', 'e', ' ', 'a', 'g', 'a', 'i', 'n', 's', 't', ' ', 'a', ' ', 'm', 'e', 'm', 'b', 'e', 'r', ' ', 'o', 'f', ' ', 'h', 'i', 's', ' ', 'o', 'w', 'n', ' ', 'p', 'a', 'r', 't', 'y', ' ', 't', 'h', 'i', 's', ' ', ',', ' ', 'i', 'n', ' ', 'i', 't', 's', 'e', 'l', 'f', ' ', ',', ' ', 'i', 's', ' ', 'n', "'", 't', ' ', 'e', 'x', 'a', 'c', 't', 'l', 'y', ' ', 'h', 'u', 'g', 'e', ' ', 'n', 'e', 'w', 's', ' ', 'i', 't', ' ', "'", 's', ' ', 'f', 'a', 'r', ' ', 'f', 'r', 'o', 'm', ' ', 't', 'h', 'e', ' ', 'f', 'i', 'r', 's', 't', ' ', 't', 'i', 'm', 'e', ' ', 't',

# Construction du vocabulaire

In [55]:
txt = ""
for saison in os.listdir(directory):
    dir_saison = "{}/{}".format(directory, saison)
    for ep in os.listdir(dir_saison):
        adr = "{}/{}".format(dir_saison,ep)
        with open(adr, 'r') as f:
            data = f.read()
            txt+=data

            
token = nltk.word_tokenize(txt)
words = Counter(token)
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words
vocab_size = len(words)
word2idx = {o:i for i,o in enumerate(words)}
word2idx["UNK"]=len(word2idx)
print("vocab size : ",vocab_size)

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
[w for w in txt for txt in train_data]
[w for txt.split() in train_data for w in txt]
[x for b in a for x in b]

In [None]:
print(word2idx)

In [48]:
train_data = [[word2idx[w] for w in s] for s in train_data]
dev_data = [[word2idx[w] if w in word2idx else 0 for w in s] for s in dev_data]
test_data = [[word2idx[w] if w in word2idx else 0 for w in s] for s in test_data]

KeyError: ' '

In [45]:
print(train_data[0])

[170, 473, 953, 448, 11, 231, 800, 3, 389, 71, 340, 141, 11, 7, 213, 13668, 98, 7, 781, 6, 28, 219, 331, 22, 3, 8, 964, 3, 10, 138, 1349, 1029, 83, 13, 30, 330, 31, 2, 67, 79, 2, 100, 29, 870, 28, 16815, 23000, 11, 28, 219, 4326, 22, 79, 3, 233, 3, 28, 918, 57, 1475, 9716, 5, 843, 15, 2621, 221, 1564, 1476, 1913, 1076, 3, 2, 4012, 6, 2, 844, 886, 405, 1467, 576, 3, 7, 13669, 12, 36, 323, 12, 192, 449, 15, 24, 93, 1076, 16816, 12, 2, 100, 30, 6528, 3, 52, 15, 2138, 4, 381, 15, 11681, 1661, 9, 93, 1076, 30, 314, 6, 2, 2316, 430, 772, 17, 28, 96, 310, 19421, 2704, 77, 6, 28, 2906, 3, 93, 1076, 271, 31, 477, 4, 3357, 49, 28, 2580, 310, 507, 118, 138, 1041, 28, 10952, 4013, 213, 295, 21, 23001, 13, 30, 7, 4406, 2, 230, 203, 29, 425, 35, 4014, 160, 878, 490, 879, 1356, 1270, 41, 2907, 22, 800, 1476, 1913, 1076, 23, 23001, 20, 185, 253, 3, 97, 242, 9, 17, 138, 2, 214, 6, 13, 3, 360, 15, 140, 1638, 16, 2, 53, 152, 145, 5, 248, 410, 2, 100, 25, 13, 146, 27, 217, 1914, 7430, 31, 2, 1564, 1476, 3

# CBOW Classifieur

In [17]:
class L_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(L_CBOW_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.l1 = nn.Linear(embedding_dim, 1)
        
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        inputs = th.sum(inputs, dim=0)
        out = th.sigmoid(self.l1(inputs))
        return out

In [279]:
class C_CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(C_CBOW_classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(1, 20, 5, 1)
        self.conv2 = nn.Conv1d(20, 50, 5, 1)
        self.dense1 = nn.Linear(450, 250)
        self.dense2 = nn.Linear(250, 64)
        self.dense3 = nn.Linear(64, 1)

    def forward(self, x):

        x = self.embeddings(x)
        x = th.sum(x, dim=0)
        x = x.view(1, 1,-1)
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2, 2)
        x = x.view(-1, 450)
        x = F.relu(self.dense1(x))
        x = F.relu(self.dense2(x))
        x = self.dense3(x)
        return th.sigmoid(x)

In [38]:
loss_fn = nn.BCELoss()
lr = 1e-2
l_cbow = L_CBOW_classifier(len(word2idx), 50)
train_accuracies = []
train_losses = []
dev_accuracies = []
dev_losses = []

In [39]:
def train_cbow(model, max_epochs=20):
    optim = th.optim.Adam(params=model.parameters(), lr =lr, weight_decay=1e-4)
#     model.train()
    idx_train = np.arange(len(train_data))
    idx_dev = np.arange(len(dev_data))
    
    for e in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        
        np.random.shuffle(idx_train)
        np.random.shuffle(idx_dev)
        n=0
        for i in idx_train:
            s = th.tensor(train_data[i])
            y = train_labels[i]
            n+=1
            label = th.tensor([y])
            pred = model(s)
            loss = loss_fn(pred, label)

            train_mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()
            
            if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                train_accuracy+=1
                
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        ###Dev test
        for i in idx_dev:
            s = th.tensor(dev_data[i])
            y = dev_labels[i]
            label = th.tensor([y])
            pred = model(s)
            loss = loss_fn(pred, label)
            dev_mean_loss+=loss.item()
            if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                dev_accuracy+=1
        train_accuracies.append(train_accuracy/len(train_data))
        train_losses.append(train_mean_loss/len(train_data))
        dev_accuracies.append(dev_accuracy/len(dev_data))
        dev_losses.append(dev_mean_loss/len(dev_data))
        
        
        print("EPOCH {}".format(e+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")

In [40]:
train_cbow(l_cbow)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [25]:
def test(model, test_data, test_labels):
    acc = 0
    test_mean_loss = 0
    
    for i in range(len(test_data)):
            s = th.tensor(test_data[i])
            y = test_labels[i]
            label = th.tensor([y])
            pred = model(s)
            loss = loss_fn(pred, label)
            test_mean_loss+=loss.item()
            if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                acc+=1
    print("Test accuracy : ", acc/len(test_data))
    print("Test mean loss : ", test_mean_loss/len(test_data))

In [27]:
test(l_cbow, test_data, test_labels)

Test accuracy :  0.9975124378109452
Test mean loss :  0.00988722845005508


# Un classifieur RNN classique (LSTM ou GRU) 

In [149]:
class RNN_classifier(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embedding_dim, rnn_dropout, is_lstm=False):
        super(RNN_classifier, self).__init__()
        
        self.nb_cells = nb_cells
        self.hidden_size = hidden_size
        self.is_lstm = is_lstm
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if is_lstm:
            self.rnn = nn.LSTM(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout)
        else:
            self.rnn = nn.GRU(embedding_dim, hidden_size, nb_cells, batch_first = True, dropout=rnn_dropout)
            
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.2)
        nn.init.xavier_uniform_(self.fc.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.fc.bias.data)  # Xavier/Glorot init for tanh
        
    
    def forward(self, inputs, hidden):
        batch_size = inputs.size(0)
        embeds = self.embedding(inputs).unsqueeze(0)
#         embeds = embeds.view(batch_size,embeds.shape[0], embeds.shape[1])
        rnn_out, hidden = self.rnn(embeds, hidden)
        rnn_out = rnn_out.contiguous().view(-1, self.hidden_size)
        
        out = self.dropout(rnn_out)
        out = self.fc(out)
        out = th.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out[-1], hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = th.Tensor(self.nb_cells, batch_size, self.hidden_size)
        if self.is_lstm:
            hidden = (weight.new(self.nb_cells, batch_size, self.hidden_size).zero_(),
                      weight.new(self.nb_cells, batch_size, self.hidden_size).zero_())
        else:
            hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
            
        return hidden
    

In [167]:
##Hyper-paramètres
nb_cells = 1
hidden_size = 32
embedding_dim = 10
learning_rate = 1e-2
fn = nn.BCELoss()
m = RNN_classifier(nb_cells, hidden_size, vocab_size, embedding_dim, 0.0)
print(m)

RNN_classifier(
  (embedding): Embedding(45809, 10)
  (rnn): GRU(10, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [171]:
def train_rnn(model, batch_size,lr, max_epochs=10, conv=False):
    optim = th.optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-4)
    model.train()
    best_dict = model.state_dict()
    best_acc = 0
    for i in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
#         train_batches = get_batches(batch_size, train_data, train_labels)
#         dev_batches = get_batches(batch_size, dev_data, dev_labels)
        h = model.init_hidden(batch_size)
        n = 0 

        for x, y in zip(train_data, train_labels):
            optim.zero_grad()
            h = h.data
            data = th.tensor(x)
            n+=1
            label = th.tensor([y])
            pred, h = model(data, h)  
            
            loss = fn(pred, label)
            h = h.detach()
            train_mean_loss+=loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()

            if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                train_accuracy+=1
        
            if(n%250==0):
                print("step : {}/{} ".format(n, len(train_data)))
                print("step accuracy : ", train_accuracy/n)
                print("step loss : ", train_mean_loss/n)
                
        for x, y in dev_batches:
            
            data = th.tensor(x)
            label = th.tensor([y])
            pred, h = model(data, h)
            loss = fn(pred, label)

            dev_mean_loss+=loss.item()
            

            if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                dev_accuracy+=1
        
        if(i==0):
            best_dict=model.state_dict()
            best_acc=dev_accuracy/len(dev_data)
        else:
            if(best_acc<(dev_accuracy/len(dev_data))):
                best_acc=dev_accuracy/len(dev_data)
                best_dict=model.state_dict()
                print("new best acc")
        
        print("EPOCH {}".format(i+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")
        
#     model.load_state_dict(best_dict)
#     acc = test(model, test_data, conv=conv)
#     stat_dict[model_name][4].append(acc)
#     print("Accuracy on test data : ", acc)
#     return best_dict

In [172]:
train_rnn(m, 1, learning_rate)

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


step : 250/3206 
step accuracy :  1.0
step loss :  0.00048478447171692094
step : 500/3206 
step accuracy :  1.0
step loss :  0.00026606387300762434
step : 750/3206 
step accuracy :  1.0
step loss :  0.00019344001689417686
step : 1000/3206 
step accuracy :  1.0
step loss :  0.00015683772761667568
step : 1250/3206 
step accuracy :  1.0
step loss :  0.00013501516601554614
step : 1500/3206 
step accuracy :  1.0
step loss :  0.00012109455851244395
step : 1750/3206 
step accuracy :  0.9965714285714286
step loss :  0.01319653223781331
step : 2000/3206 
step accuracy :  0.997
step loss :  0.011594657813240702
step : 2250/3206 
step accuracy :  0.9973333333333333
step loss :  0.010330106699013818
step : 2500/3206 
step accuracy :  0.9976
step loss :  0.009312117644745377
step : 2750/3206 
step accuracy :  0.9978181818181818
step loss :  0.008476053057279304


KeyboardInterrupt: 