In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [2]:
# Read the sheet with no header row and default integer row labels, then
# discard the row labelled 0 (presumably the sheet's own header line - confirm).
# Doing both in one expression keeps the cell safe to re-run on its own.
df = (
    pd.read_excel('fake_bananas_structuredKopie_4.xlsx', index_col=None, header=None)
    .drop([0])
)
print(df.shape)
df.head()

(3231, 4)


Unnamed: 0,0,1,2,3
1,Tourist dubbed ‘Spider Man’ after spider burro...,0,agree,A small meteorite crashed into a wooded area i...
2,Giant 8ft 9in catfish weighing 19 stone caught...,0,disagree,A small meteorite crashed into a wooded area i...
3,Enormous 20-stone catfish caught with fishing ...,0,agree,A small meteorite crashed into a wooded area i...
4,ISIS Militants Allegedly Contracted Ebola,5,agree,(NEWSER) – Wonder how long a Quarter Pounder w...
5,Matt Taibbi on leave of absence from First Loo...,5,disagree,(NEWSER) – Wonder how long a Quarter Pounder w...


In [3]:
def preprocess(text, lemmatizer = WordNetLemmatizer()):
    """Clean a text: strip punctuation, lower-case, drop stop words, lemmatize.

    Args:
        text (str): raw text to clean.
        lemmatizer: object exposing ``lemmatize(word)``. The default instance
            is created once, when the function is defined.

    Returns:
        str: the surviving words in their original order, lemmatized, each
        followed by one space (a non-empty result therefore ends in a space).
    """
    stop_words = set(stopwords.words('english'))
    # Remove every character that is neither a word character nor whitespace.
    normalized = re.sub(r'[^\w\s]', '', text).lower()
    kept = [
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in stop_words
    ]
    return "".join(token + " " for token in kept)

def create_vocab(df):
    """Build a frequency-ranked vocabulary from columns 0 and 3 of ``df``.

    Every cell of column 0 (headline) and column 3 (body) is cleaned with
    ``preprocess`` (stop words removed, words lemmatized) and the 5000 most
    frequent remaining words are kept. Called by ``fit_vectorizers``.

    Args:
        df (pandas.DataFrame): frame holding text in the columns labelled
            0 and 3. Any row index is accepted.

    Returns:
        pandas.DataFrame: one row per word, column 0 holding the word and
        column 1 its count, ordered from most to least frequent.
    """
    print("creating vocabulary...")
    cleaned_texts = []
    # Walk the column values themselves instead of looking rows up by label:
    # every row is visited (including the last one) and the frame's index no
    # longer has to be the contiguous range 1..N.
    for headline, body in zip(df[0], df[3]):
        cleaned_texts.append(preprocess(headline))
        cleaned_texts.append(preprocess(body))

    word_counts = Counter(" ".join(cleaned_texts).split()).most_common(5000)
    # Explicit labels keep the (word, count) layout even when nothing was counted,
    # so callers can always select column 0.
    return pd.DataFrame(word_counts, columns=[0, 1])

#for testing
#v = create_vocab(df)
#print(len(v))
#print(v.head())

In [41]:
def fit_vectorizers(dataset):
    """Fit a bag-of-words and a TF-IDF vectorizer on all headlines and bodies.

    Both vectorizers share the fixed vocabulary produced by ``create_vocab``,
    so they emit one feature column per vocabulary word, in vocabulary order.

    Args:
        dataset (pandas.DataFrame): frame holding headlines in the column
            labelled 0 and bodies in the column labelled 3.

    Returns:
        tuple: ``(bow, tfidf)`` - the fitted ``CountVectorizer`` and
        ``TfidfVectorizer``.
    """
    print('fitting vectorizer...')
    # Take the column values directly so that no row is skipped and the row
    # index does not matter. Bodies come from column 3; column 2 is not text.
    data = list(dataset[0])   # headlines
    data += list(dataset[3])  # bodies

    vocab = create_vocab(dataset)[0]

    # With an explicit vocabulary the feature columns are fixed up front and
    # scikit-learn ignores max_features, so it is not passed. Fitting still
    # matters for TF-IDF, which learns its IDF weights from `data`.
    # NOTE(review): the vocabulary holds lemmatized words from preprocess(),
    # while the vectorizers tokenize the raw text with their default analyzer,
    # so inflected forms will not hit their lemma's column. Passing
    # preprocessor=preprocess would align the two - left unchanged here.
    bow = CountVectorizer(vocabulary=vocab)
    tfidf = TfidfVectorizer(vocabulary=vocab)
    bow.fit(data)
    tfidf.fit(data)
    print('vectorizer ready!')
    return bow, tfidf

In [42]:
# test block (works, already added to dataloader class)
# Smoke test: fit both vectorizers on the full frame, then vectorize one
# throwaway string to check the output width.
bow, tfidf = fit_vectorizers(df)
test_input = ['weee']

# number of entries in the fitted bag-of-words vocabulary
print(len(bow.vocabulary_))
# last expression of the cell, so its value is displayed:
# (number of input texts, number of vocabulary entries)
bow.transform(test_input).shape

fitting vectorizer...
creating vocabulary...
vectorizer ready!
5000


(1, 5000)

In [114]:
class StanceDataset(Dataset):
    """Map-style dataset that turns (claim, body) rows into feature vectors.

    Each sample is the concatenation of the claim's bag-of-words counts, the
    cosine similarity between the TF-IDF vectors of claim and body, and the
    body's bag-of-words counts.
    """

    def __init__(self, stance_df, random_state=None):
        """
        Args:
            stance_df (pandas.DataFrame): the dataset; claims are read from the
                column labelled 0 and bodies from the column labelled 3.
            random_state (int, optional): seed for the shuffled train split.
                The default ``None`` gives a different split on every run.
        """
        self.data = stance_df

        # NOTE(review): the vectorizers are fit on every row before the split,
        # so vocabulary and IDF weights also see test/validation text - confirm
        # that this is intended.
        self._bow, self._tfidf = fit_vectorizers(self.data)

        # splits: 70% train, the remaining 30% halved into test and validation
        self.train, held_out = train_test_split(
            self.data, test_size=0.3, shuffle=True, random_state=random_state
        )
        self.test, self.val = train_test_split(held_out, test_size=0.5, shuffle=False)
        # split sizes
        self.train_len = len(self.train)
        self.test_len = len(self.test)
        self.val_len = len(self.val)

        self._lookup_dict = {'train': (self.train, self.train_len),
                             'val': (self.val, self.val_len),
                             'test': (self.test, self.test_len)}

    def __len__(self):
        """Return the number of rows in the full dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Build the feature vector for one row.

        Args:
            index (int): position of the row, in ``0 .. len(self) - 1``.
        Returns:
            torch.Tensor: float64 tensor of shape ``(1, 2 * V + 1)`` where V is
            the vocabulary size, laid out as
            ``[claim counts | cosine similarity | body counts]``.
        Raises:
            IndexError: if ``index`` is outside the dataset.
        """
        # Select the row by position, not by label: samplers hand out positions
        # 0..len-1, whereas the frame's labels need not start at 0 or be
        # contiguous (the first row is dropped on load).
        claim = [self.data[0].iloc[index]]
        body = [self.data[3].iloc[index]]

        # bag-of-words counts for claim (headline/tweet) and body
        bow_claim = self._bow.transform(claim)
        bow_body = self._bow.transform(body)

        # TF-IDF vectors are only used for the similarity feature
        tfidf_claim = self._tfidf.transform(claim)
        tfidf_body = self._tfidf.transform(body)
        cosim = cosine_similarity(tfidf_claim, tfidf_body)

        # Integer counts and the float similarity are stacked side by side;
        # the result is promoted to float64.
        features = np.hstack([bow_claim.toarray(), cosim, bow_body.toarray()])

        # NOTE(review): only features are returned; column 2 looks like the
        # stance label and is not part of the sample - a training loop will
        # presumably need it.
        return torch.from_numpy(features)

In [115]:
def load_dataset(path='fake_bananas_structuredKopie_4.xlsx'):
    """
    Read the stance spreadsheet and wrap it in a StanceDataset.

    Args:
        path (str): Excel file to read. Defaults to the file shipped with
            this notebook.
    Returns:
        StanceDataset: dataset built from the sheet, minus the row labelled 0.
    """
    print('loading dataset...')
    # No header row is parsed, so columns are labelled 0, 1, 2, ... and the
    # first sheet row arrives as data under row label 0; that row is discarded.
    df = pd.read_excel(path, index_col=None, header=None)
    df = df.drop([0], axis=0)
    print('loaded!')

    return StanceDataset(df)
    

In [116]:
# Build the dataset once: this reads the spreadsheet and fits both vectorizers.
dataset = load_dataset()

loading dataset...
loaded!
fitting vectorizer...
creating vocabulary...
vectorizer ready!


In [117]:
"""
Fetch one sample the way the training loop will, once per row.
"""

item = dataset[1]
print(item)

tensor([[0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)


## MLP setup and training

In [126]:
# Iterate the dataset one sample per batch, in a freshly shuffled order on each
# pass, loading through a single worker subprocess.
# NOTE(review): worker subprocesses can be troublesome for classes defined inside
# a notebook on platforms that spawn instead of fork - consider num_workers=0
# while debugging; confirm on the target platform.
train_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True, num_workers=1)

In [None]:
"""
Simulates the training loop by printing every batch.
Problems
    - strange key error still happens
      (likely cause: a label-based row lookup in __getitem__ - DataLoader
      samples positions 0..len(dataset)-1, and row label 0 was dropped on load)
    - when dataset.train is passed into DataLoader() instead, the key error
      happens way sooner
      (likely cause: dataset.train is a plain DataFrame, not a Dataset, so an
      integer index selects a column and only the labels 0-3 exist)

"""

for batch in train_loader:
    print(batch)