In [35]:
import re
import csv
import torch as th
import torch.nn as nn
import torch.functional as F
import numpy as np
import nltk
from collections import Counter 

# Lecture des données et construction des train, dev et test sets

In [11]:
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [41]:
##On transforme les 3 colonnes en 1 : url, titre et le texte deviennent un seul élément
def read_data(filename):
    data = []
    labels = []
    with open(filename, encoding="utf8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            txt = clean_str(row[2])
            txt = " ".join(txt.split("\n"))
            d = clean_str(row[0])+" "+clean_str(row[1])+" "+txt 
            labels.append(row[3])
            data.append(d)    
    data.pop(0)
    labels.pop(0)
    labels = [int(l) for l in labels]
    return np.array(data), np.array(labels)

In [56]:
def get_batches(batch_size, dataset, labels):
    for i in range(0, len(dataset[0]), batch_size):
        yield dataset[i:i+batch_size], labels[i:i+batch_size]

In [42]:
data, labels = read_data("data.csv")

In [43]:
print(data.shape)
fake_data = data[labels==0]
real_data = data[labels==1]
fake_labels = labels[labels==0]
real_labels = labels[labels==1]
print("Nombre de fake news : {}".format(len(fake)))
print("Nombre de news correctes : {}".format(len(real)))


(4009,)
Nombre de fake news : 2137
Nombre de news correctes : 1872


In [44]:
print(fake_data.shape)

(2137,)


In [57]:
train_data = [*real_data[:int(len(real_data)*0.8)],*fake_data[:int(len(fake_data)*0.8)]]
train_labels = np.array([1.0 for i in range(int(len(train_data)/2))]+[0.0 for i in range(int(len(train_data)/2))])

dev_data = [*real_data[int(len(real_data)*0.8):int(len(real_data)*0.9)], *fake_data[int(len(fake_data)*0.8):int(len(fake_data)*0.9)]]
dev_labels = np.array([1.0 for i in range(int(len(dev_data)/2))]+[0.0 for i in range(int(len(dev_data)/2))])

test_data = [*real_data[int(len(real_data)*0.9):], *fake_data[int(len(fake_data)*0.9):]]
test_labels = np.array([1.0 for i in range(int(len(test_data)/2))]+[0.0 for i in range(int(len(test_data)/2))])

In [46]:
print("taille du train set : {}".format(len(train_data)))
print("taille du dev set : {}".format(len(dev_data)))
print("taille du test set : {}".format(len(test_data)))

taille du train set : 3206
taille du dev set : 401
taille du test set : 402


# Construction du vocabulaire

In [53]:
train_txt = ""
for txt in train_data:
    train_txt+=  txt+" "
        
token = train_txt.lower().split()
words = Counter(token)
words = sorted(words, key=words.get, reverse=True)
vocab_size = len(words)
word2idx = {o:i for i,o in enumerate(words)}
word2idx["UNK"]=len(word2idx)
print("vocab size : ",vocab_size)

vocab size :  45809


In [54]:
print(word2idx)



In [59]:
int_data = []
for txt in train_data:
    txt = txt.lower().split()
    txt = [word2idx[w] for w in txt]
    int_data.append(txt)
train_data = int_data

int_data = []
for txt in dev_data:
    txt = txt.lower().split()
    txt = [word2idx[w] if w in word2idx else word2idx["UNK"] for w in txt]
    int_data.append(txt)
dev_data = int_data

int_data = []
for txt in test_data:
    txt = txt.lower().split()
    txt = [word2idx[w] if w in word2idx else word2idx["UNK"] for w in txt]
    int_data.append(txt)
test_data = int_data

In [150]:
class LSTM_classifier(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embedding_dim):
        super(LSTM_classifier, self).__init__()
        
        self.nb_cells = nb_cells
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, nb_cells, batch_first = True)
        self.l1 = nn.Linear(hidden_size, 1)
        
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs, hidden):
        embeds = self.embedding(inputs)
        embeds = embeds.view(1,embeds.shape[0], embeds.shape[1])
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_size)
        
        out = self.l1(lstm_out)
        out = out.view(1, -1)
        out = out[:,-1]
        return th.sigmoid(out)
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.nb_cells, batch_size, self.hidden_size).zero_(),
                      weight.new(self.nb_cells, batch_size, self.hidden_size).zero_())
        return hidden

In [157]:
##Hyper-paramètres
nb_cells = 1
hidden_size = 64
embedding_dim = 15
learning_rate = 1e-2
fn = nn.BCELoss()
m = LSTM_classifier(nb_cells, hidden_size, vocab_size, embedding_dim)

In [156]:
def train(model, batch_size,lr, max_epochs=10, conv=False):
    optim = th.optim.SGD(params=model.parameters(), lr=lr, weight_decay=1e-4)
    model.train()
    best_dict = model.state_dict()
    best_acc = 0
    for i in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        train_batches = get_batches(batch_size, train_data, train_labels)
        dev_batches = get_batches(batch_size, dev_data, dev_labels)
        h = model.init_hidden(1)

        for x, y in zip(train_data, train_labels):
            data = th.tensor(x)
            label = th.tensor([y])
            pred = model(data, h)
            loss = fn(pred, label)

            train_mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()


            if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                train_accuracy+=1
        
        for x, y in dev_batches:
            
            data = th.tensor(x)
            label = th.tensor([y])
            pred = model(data, h)
           
            loss = fn(pred, label)

            dev_mean_loss+=loss.item()
            

            if((label==1 and pred>0.5) or(label==0 and pred<0.5)):
                train_accuracy+=1
        
        if(i==0):
            best_dict=model.state_dict()
            best_acc=dev_accuracy/len(dev_data[0])
        else:
            if(best_acc<(dev_accuracy/len(dev_data[0]))):
                best_acc=dev_accuracy/len(dev_data[0])
                best_dict=model.state_dict()
                print("new best acc")
        
        print("EPOCH {}".format(i+1))
        print("Train Accuracy : ",train_accuracy/len(train_data[0]))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data[0]))
        print("Train Mean loss : ",train_mean_loss/len(train_data[0]))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data[0]))
        print("----------------------------------------")
        
#     model.load_state_dict(best_dict)
#     acc = test(model, test_data, conv=conv)
#     stat_dict[model_name][4].append(acc)
#     print("Accuracy on test data : ", acc)
#     return best_dict

In [158]:
train(m, 25, learning_rate, max_epochs=20)

KeyboardInterrupt: 