In [120]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [132]:
# test block to tinker with vectorizer, add to custom dataloader once finished
df = pd.read_excel('fake_bananas_structuredKopie_4.xlsx', index_col=None, header=None)
df = df.drop([0], axis=0)
print(df.shape)
df.head()

def vectorizer_init(dataset):
    
    x = dataset.iloc[:,0:1]
    x2 = dataset.iloc[:,2:3]
    # create array of all texts in dataset
    data = [(x[0])[i+1] for i in range(len(x)-1)]
    data += ((x[0])[i+1] for i in range(len(x)-1))
    
    # fit to dataset (creates dictionary)
    tfidf = TfidfVectorizer(max_features=5000)
    tfidf.fit(data)
    
    return tfidf

(3231, 4)


In [131]:
# try test block (works, already added to dataloader class)
v = vectorizer_init(df)
test_input = ['weee']

print(len(v.vocabulary_))
v.transform(test_input).shape

2857


(1, 2857)

In [133]:
# custom dataloader class
class StanceDataset(Dataset):
    def __init__(self, stance_df, vectorizer):
        """
        Args:
            stance_df (pandas.DataFrame): the dataset
            vectorizer (TfidfVectorizer): sklearn vectorizer
        """
        self.data = stance_df
        self._vectorizer = vectorizer
        
        # splits (train, test, validation)
        self.train, self.test = train_test_split(self.data, test_size=0.3, shuffle=True)
        self.test, self.val = train_test_split(self.test, test_size=0.5, shuffle=False)
        # split sizes
        self.train_len = len(self.train)
        self.test_len = len(self.test)
        self.val_len = len(self.val)
        
        self._lookup_dict = {'train': (self.train, self.train_len),
                             'val': (self.val, self.val_len),
                             'test': (self.test, self.test_len)}

    # returns length of dataset
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        """
        primary entry
        Args:
            index (int): index to current data point
        Returns:
            dictionary holding data point feature (x_data) and label (y_target)
        """
        
        # get text values from current row ------!!! CHANGE ACCORDING TO DF !!!! -----
        claim = [self.data[0][index]]
        body = [self.data[3][index]]
        
        # transform claim (headline/tweet) and target (body) to tfidf vector
        x = self._vectorizer.transform(claim)
        y = self._vectorizer.transform(body)
        
        # take cosine similary
        cosim = cosine_similarity(x, y)
        
        # concat x-cosim-y to input vector
        claim_df = pd.DataFrame(x.toarray()) 
        body_df = pd.DataFrame(y.toarray())
        cosim_df = pd.DataFrame(cosim)
        
        tenk = (pd.concat([claim_df, cosim_df, body_df],axis=1)).to_numpy()
        #tenk = tenk.to_numpy()
        #tenk = torch.from_numpy(tenk)
        
        return tenk

In [123]:
def load_dataset():
    """
    load dataset and vectorizer
    """
    print('loading dataset...')
    df = pd.read_excel('fake_bananas_structuredKopie_4.xlsx', index_col=None, header=None)
    df = df.drop([0], axis=0)
    print(df.head())
    # instantiate vectorizer
    tfidf_vectorizer = vectorizer_init(df)
    
    dataset = StanceDataset(df, tfidf_vectorizer)
    return dataset
    

In [124]:
dataset = load_dataset()

loading dataset...
                                                   0  1         2  \
1  Tourist dubbed ‘Spider Man’ after spider burro...  0     agree   
2  Giant 8ft 9in catfish weighing 19 stone caught...  0  disagree   
3  Enormous 20-stone catfish caught with fishing ...  0     agree   
4          ISIS Militants Allegedly Contracted Ebola  5     agree   
5  Matt Taibbi on leave of absence from First Loo...  5  disagree   

                                                   3  
1  A small meteorite crashed into a wooded area i...  
2  A small meteorite crashed into a wooded area i...  
3  A small meteorite crashed into a wooded area i...  
4  (NEWSER) – Wonder how long a Quarter Pounder w...  
5  (NEWSER) – Wonder how long a Quarter Pounder w...  


In [130]:
"""
This is what the train loop should do each line. 
The goal is that this returns the exactly 1001 sized vec, but the dict of our df is too smol :(
"""

item = dataset.__getitem__(1)
print(item)
item[0].shape

[[0. 0. 0. ... 0. 0. 0.]]


(5715,)