## Read Data

In [1]:
import pandas as pd

#Read data
PATH = r'Fake.csv'
df = pd.read_csv(PATH, delimiter="," ,header=0)

#Drop na and drop_duplicates
print("Samples: ", df.size)
df = df.dropna()
df = df.drop_duplicates()
print("New samples size: ",df.size)
df.head()

Samples:  93924
New samples size:  93912


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [2]:
print(df['title'].size)
print(df['text'].size)
23478 + 23478

23478
23478


46956

In [3]:
#concate title and text
#new_sr = df['title'].append(df['text'])
new_sr = df['title']

In [4]:
type(new_sr)

pandas.core.series.Series

In [5]:
texts = new_sr.tolist() #Convert Series to list

In [6]:
raw_text = ''
for i in range(len(texts)):
    raw_text += texts[i] + " "

In [7]:
#Check texts
print(len(texts))
texts

23478


[' Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing',
 ' Drunk Bragging Trump Staffer Started Russian Collusion Investigation',
 ' Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’',
 ' Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)',
 ' Pope Francis Just Called Out Donald Trump During His Christmas Speech',
 ' Racist Alabama Cops Brutalize Black Boy While He Is In Handcuffs (GRAPHIC IMAGES)',
 ' Fresh Off The Golf Course, Trump Lashes Out At FBI Deputy Director And James Comey',
 ' Trump Said Some INSANELY Racist Stuff Inside The Oval Office, And Witnesses Back It Up',
 ' Former CIA Director Slams Trump Over UN Bullying, Openly Suggests He’s Acting Like A Dictator (TWEET)',
 ' WATCH: Brand-New Pro-Trump Ad Features So Much A** Kissing It Will Make You Sick',
 ' Papa John’s Founder Retires, Figures Out Racism Is Bad For Business',
 ' WATCH: Paul Ryan Just Told Us He Doesn’t Care About S

## BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b")
vectorizer.fit(texts)
print("Vocabulary:", vectorizer.vocabulary_)
bag_of_words = vectorizer.transform(texts)
print("Bag of words:", bag_of_words.toarray())

## Train model CBOW

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from collections import Counter


In [None]:
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
  
# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device: {torch.cuda.current_device()}")
        
print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")

In [None]:
CONTEXT_SIZE = 2  # number of context words to use , CONTEXT_WINDOW_SIZE là số từ bao xung quanh từ đó 
EMBEDDING_DIM = 128  # dimensionality of word embeddings

In [None]:
tokens = []
# Tokenize the text and create vocabulary
for i in range(len(texts)):
    tokens += texts[i].lower().split() #split to tokens
word_counts = Counter(tokens) #Number of words in raw_text
vocab = list(set(tokens)) #create vocab

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []

for i in range(CONTEXT_SIZE, len(tokens) - CONTEXT_SIZE):
    context = [tokens[i - 2], tokens[i - 1], tokens[i + 1], tokens[i + 2]]
    target = tokens[i]
    data.append((context, target))

In [None]:
len(vocab)

In [None]:
# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function = nn.ReLU()
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs).sum(dim=0).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function(out)
        out = self.linear2(out)
        return out

In [None]:
# Train the CBOW model
cbow_model = CBOW(len(vocab), EMBEDDING_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(cbow_model.parameters(), lr=0.1)
for epoch in range(3):
    total_loss = 0
    for context, target in data:
        context_idxs = [word_to_ix[w] for w in context]
        context_var = Variable(torch.LongTensor(context_idxs))
        cbow_model.zero_grad()
        log_probs = cbow_model(context_var)
        loss = loss_function(log_probs, Variable(torch.LongTensor([word_to_ix[target]])))
        loss.backward()
        optimizer.step()
        total_loss += loss.data
        #print("Loss data: ", loss.data)
    print('Epoch:', epoch, 'Loss:', total_loss)

# Get word embeddings
word_embeddings = cbow_model.embedding.weight.data.numpy()
#print(word_embeddings[word_to_ix['process']])

## test model use cuda

In [8]:
# This file Train a word Embedding Using CBOW
# Implementing refer to official tutorial of pytorch
# https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import WikiText2
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from tensorboardX import SummaryWriter
from utils import print_k_nearest_neighbour
# download WikiText2
# WikiText2.download('./corpus')
import nltk
nltk.download('punkt')

class WikiText2DataSet(Dataset):
    """
    We are just training word embeddings, what we need is just text,
    And thus we do not perform train, val, test splitting and sort of
    things. You can change the data file to whatever you want as long
    as it's plain text, and it's not that big.
    It's toy implementation, train on rather small dataset,
    so we don't restrict vocabulary size.
    """
    def __init__(self, data, window_size=2):
        """
        :param data_file_path: path for the plain text file
        :param ngram:  language model n-grams
        """
        words_tokenized = word_tokenize(data)

        self.context_target =  [([words_tokenized[i-(j+1)] for j in range(window_size)] +\
                                 [words_tokenized[i+(j+1)] for j in range(window_size)],
                                words_tokenized[i])
                                for i in range(window_size, len(words_tokenized)-window_size)]

        self.vocab = Counter(words_tokenized)
        self.word_to_idx = {word_tuple[0]: idx for idx, word_tuple in enumerate(self.vocab.most_common())}
        self.idx_to_word = list(self.word_to_idx.keys())
        self.vocab_size = len(self.vocab)
        self.window_size = window_size

    def __getitem__(self, idx):
        context = torch.tensor([self.word_to_idx[w] for w in self.context_target[idx][0]])
        target = torch.tensor([self.word_to_idx[self.context_target[idx][1]]])
        return context, target

    def __len__(self):
        return len(self.context_target)

class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, window_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.window_size = window_size

    def forward(self, inputs):

        embeds = torch.sum(self.embeddings(inputs), dim=1) # [200, 4, 50] => [200, 50]
        # embeds = self.embeddings(inputs).view((batch_size, -1))
        out = self.linear(embeds) # nonlinear + projection
        log_probs = F.log_softmax(out, dim=1) # softmax compute log probability

        return log_probs


WINDOWS_SIZE = 2
EMBEDDING_DIM = 50
BATCH_SIZE = 500
NUM_EPOCH = 20

# I think torchtext is really hard to use
# It's a toy example, so you can use any plain text dataset
data = raw_text
# data_file_path = './corpus/Pride-and-Prejudice.txt'

data = WikiText2DataSet(data=data)
model = CBOW(len(data.vocab), EMBEDDING_DIM, WINDOWS_SIZE)
# optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.NLLLoss()
losses = []
cuda_available = torch.cuda.is_available()
data_loader = DataLoader(data, batch_size=BATCH_SIZE)

# Writer
writer = SummaryWriter('./logs/CBOW')

for epoch in range(NUM_EPOCH):
    total_loss = 0
    for context, target in tqdm(data_loader):
        # context: torch.Size([10, 4])
        # target:  torch.Size([10, 1])
        if context.size()[0] != BATCH_SIZE:
            continue
        # deal with last several batches

        if cuda_available:
            context = context.cuda()
            target = target.squeeze(1).cuda()
            print(target.shape)
            model = model.cuda()

        model.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    losses.append(total_loss)
    writer.add_scalar('Train/Loss', total_loss, epoch)

    # TODO add visualization of embedding
    # writer.add_embedding(model.embeddings.weight, metadata=data.word_to_idx.keys(), global_step=epoch)
    # It should work, but unfortunately not. see this issue, it seems like a tensorboard 1.11.0's
    # https://github.com/tensorflow/tensorboard/issues/1480

    print('total_loss:',total_loss)

writer.close()

# print some results
embed_matrix = model.embeddings.weight.detach().cpu().numpy()
print_k_nearest_neighbour(embed_matrix, data.word_to_idx['she'], 10, list(data.word_to_idx.keys()))
print_k_nearest_neighbour(embed_matrix, data.word_to_idx['is'], 10, list(data.word_to_idx.keys()))
print_k_nearest_neighbour(embed_matrix, data.word_to_idx['good'], 10, list(data.word_to_idx.keys()))

# TODO, refine the models, take models, and dataset into one class file respectively

OSError: [WinError 127] The specified procedure could not be found