In [1]:
import torch
import torch.nn as nn

import pandas as pd
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [2]:
# The file has no header row, so pandas labels the columns 0..5.
# Column 0 holds the raw sentiment label and column 5 the tweet text.
tweets_df = pd.read_csv(
    './training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin1',
    engine='python',
)
tweets_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Class balance check: count how often each raw label in column 0 occurs.
tweets_df[0].value_counts()

0
0    800000
4    800000
Name: count, dtype: int64

In [4]:
# Keep the raw label as a categorical column and derive a compact integer code
# from it (raw label 0 becomes code 0, raw label 4 becomes code 1).
raw_label_cat = tweets_df[0].astype('category')
tweets_df['sentiment_cat'] = raw_label_cat
tweets_df['sentiment'] = raw_label_cat.cat.codes

In [5]:
# Spot-check ten random rows to confirm the two new sentiment columns agree
# with column 0. No random_state is passed, so the rows differ on every run.
tweets_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,sentiment_cat,sentiment
1321983,4,2014746526,Wed Jun 03 02:33:02 PDT 2009,NO_QUERY,davidliew,"Finally finished those many works today, now c...",4,1
131810,0,1835612983,Mon May 18 06:22:55 PDT 2009,NO_QUERY,kluper,@no634 aww the pre-finals paper jam happened t...,0,0
1394560,4,2053630459,Sat Jun 06 05:12:18 PDT 2009,NO_QUERY,beanandgone,@hasbean coffee shop crawl?,4,1
327881,0,2010150562,Tue Jun 02 16:53:52 PDT 2009,NO_QUERY,ERK,16GB SD still works after reformat. I just got...,0,0
493588,0,2184649495,Mon Jun 15 16:13:00 PDT 2009,NO_QUERY,eireanneilis,needs a job that makes more than peanuts or ju...,0,0
1477171,4,2066230315,Sun Jun 07 10:07:45 PDT 2009,NO_QUERY,TLM26,@runnersrambles thanks for running with my slo...,4,1
1225027,4,1990629503,Mon Jun 01 05:21:55 PDT 2009,NO_QUERY,RAWRachael,@saltyshutter sandstone drive (by st Michaels ...,4,1
85656,0,1753898765,Sun May 10 04:01:22 PDT 2009,NO_QUERY,lanied3ph,says work again tomorrow http://plurk.com/p/s...,0,0
1378692,4,2052040131,Fri Jun 05 23:29:13 PDT 2009,NO_QUERY,diveacademy,is diving at the Mogan Wrecks and Medio Elmud,4,1
1275155,4,2001054099,Mon Jun 01 23:42:29 PDT 2009,NO_QUERY,kaokun,@tomlapille reading Learning Perl when I was i...,4,1


In [6]:
# Persist the labelled frame, plus a 10,000-row subset for quick experiments.
# Both files are written without a header row or index column, so they have to
# be read back with header=None and addressed by column position.
tweets_df.to_csv('train-processed.csv', header=False, index=False)

# random_state pins the subset: re-running the notebook regenerates the same
# sample file, and therefore the same downstream train/val/test splits.
tweets_df.sample(10000, random_state=42).to_csv(
    'train-processed-sample.csv', header=False, index=False
)

In [7]:
# Load spaCy's 'en_core_web_sm' pipeline once; tokenize() below reuses this object.
nlp = spacy.load('en_core_web_sm')

def tokenize(tweet) -> list:
    """Split a tweet into lower-cased token strings.

    Only tokenization is needed here, so the text goes through
    ``nlp.make_doc`` (the pipeline's tokenizer on its own) instead of the
    full ``nlp(...)`` call, which would also run every other pipeline
    component on each tweet only for the results to be thrown away.

    Args:
        tweet: Raw tweet text.

    Returns:
        The token texts in order of appearance, each lower-cased.
    """
    return [token.text.lower() for token in nlp.make_doc(tweet)]


def build_vocab(tweets, min_freq=1):
    """Map every sufficiently frequent token to a unique integer id.

    Ids 0 and 1 are reserved for ``"<pad>"`` and ``"<unk>"``. All other
    tokens receive consecutive ids starting at 2, in order of first
    appearance in ``tweets``.

    Args:
        tweets: Iterable of token sequences (one sequence per tweet).
        min_freq: Minimum number of occurrences a token needs to get an id.

    Returns:
        A dict mapping token -> id.
    """
    counter = Counter()
    for tweet in tweets:
        counter.update(tweet)

    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        # A corpus token that spells a reserved token must not overwrite its
        # id: that would move <pad>/<unk> and hand the same id out twice.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

def collate_fn(batch):
    """Combine ``(token_ids, label)`` samples into one padded batch.

    Args:
        batch: Sequence of ``(1-D tensor of token ids, label)`` pairs.

    Returns:
        ``(padded, targets)``: the sequences padded with 0 up to the longest
        one in the batch (batch dimension first), and the labels as a tensor.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

class TweetDataset(Dataset):
    """Tokenized tweets and their labels, served as integer tensors.

    Each tweet is tokenized once at construction time and converted to a
    list of vocabulary ids; ``__getitem__`` only wraps those lists in tensors.
    """

    def __init__(self, tweets, labels, vocab=None) -> None:
        """Tokenize and numericalize ``tweets``.

        Args:
            tweets: Iterable of raw tweet strings.
            labels: Iterable of labels, aligned with ``tweets`` by position.
            vocab: Optional token -> id mapping that contains ``"<unk>"``.
                When omitted (or empty) a vocabulary is built from ``tweets``.

        Raises:
            ValueError: If ``tweets`` and ``labels`` differ in length.
        """
        self.tokens = [tokenize(tweet) for tweet in tweets]
        # Materialize as a list so __getitem__ indexes by position. A pandas
        # Series with an integer index treats series[i] as a label lookup,
        # which after a shuffled split raises KeyError or returns the label
        # of a different tweet.
        self.labels = list(labels)
        if len(self.labels) != len(self.tokens):
            raise ValueError(
                f"got {len(self.tokens)} tweets but {len(self.labels)} labels"
            )
        self.vocab = vocab or build_vocab(self.tokens)
        self.data = [self.token_to_num(tokens) for tokens in self.tokens]

    def token_to_num(self, tokens):
        """Return the vocabulary id of each token, using the ``"<unk>"`` id for unknown tokens."""
        unk_id = self.vocab["<unk>"]
        return [self.vocab.get(token, unk_id) for token in tokens]

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` tensors for the sample at position ``index``."""
        # dtype is pinned so an empty tweet still yields an integer tensor
        # (torch.tensor([]) would otherwise default to float32).
        token_ids = torch.tensor(self.data[index], dtype=torch.long)
        return token_ids, torch.tensor(self.labels[index])

In [8]:

batch_size = 64
num_workers = 4
sample_tweets_df = pd.read_csv('./train-processed-sample.csv', header=None)

# The sample file has no header row: position 5 is the tweet text and
# position 7 the integer sentiment code.
tweets = sample_tweets_df.loc[:, 5]
labels = sample_tweets_df.loc[:, 7]

# Stratified 70 / 15 / 15 split into train / validation / test.
tweets_train, tweets_temp, labels_train, labels_temp = train_test_split(
    tweets, labels, test_size=0.3, random_state=42, stratify=labels
)
tweets_val, tweets_test, labels_val, labels_test = train_test_split(
    tweets_temp, labels_temp, test_size=0.5, random_state=42, stratify=labels_temp
)

# Labels are handed over as plain lists so that position i in a dataset always
# pairs tweet i with label i, regardless of the shuffled pandas index that the
# split leaves behind.
# Validation and test reuse the training vocabulary: building a separate
# vocabulary per split would give the same token different ids in each split.
train_data = TweetDataset(tweets=tweets_train, labels=labels_train.tolist())
val_data = TweetDataset(tweets=tweets_val, labels=labels_val.tolist(), vocab=train_data.vocab)
test_data = TweetDataset(tweets=tweets_test, labels=labels_test.tolist(), vocab=train_data.vocab)

train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn)

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline once; tokenize() in the next cell reads this global.
nlp = spacy.load('en_core_web_sm')

In [None]:
def tokenize(text):
    """Return the lower-cased token strings of ``text``.

    ``nlp.make_doc`` runs only the pipeline's tokenizer. The full
    ``nlp(text)`` call would additionally run every other pipeline component
    on each text, and none of that output is used here.

    Args:
        text: Raw text to split.

    Returns:
        List of token texts in order of appearance, each lower-cased.
    """
    return [tok.text.lower() for tok in nlp.make_doc(text)]


def build_vocab(token_lists, min_freq=1):
    """Assign a unique integer id to every sufficiently frequent token.

    ``"<pad>"`` is always id 0 and ``"<unk>"`` id 1. Every other token that
    occurs at least ``min_freq`` times gets the next free id (2, 3, ...), in
    order of first appearance.

    Args:
        token_lists: Iterable of token sequences.
        min_freq: Minimum occurrence count for a token to be included.

    Returns:
        A dict mapping token -> id.
    """
    counter = Counter()
    for tokens in token_lists:
        counter.update(tokens)

    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        # Skip tokens that spell a reserved entry so <pad>/<unk> keep their ids.
        if freq >= min_freq and word not in vocab:
            # The next free id is the current vocabulary size. Using the
            # word's own length here would give every equally long word the
            # same id and collide with the reserved ids.
            vocab[word] = len(vocab)

    return vocab



In [None]:
class TweetDataset(Dataset):
    """Map-style dataset of numericalized tweets and their labels.

    Tokenization and id lookup happen once in ``__init__``; ``__getitem__``
    only converts the stored ids and label to tensors.
    """

    def __init__(self, tweets, labels, vocab=None):
        """Tokenize ``tweets`` and convert them to vocabulary ids.

        Args:
            tweets: Iterable of raw tweet strings.
            labels: Iterable of labels, aligned with ``tweets`` by position.
            vocab: Optional token -> id mapping that contains ``"<unk>"``.
                When omitted (or empty) a vocabulary is built from ``tweets``.

        Raises:
            ValueError: If the number of tweets and labels differs.
        """
        self.tokens = [tokenize(tweet) for tweet in tweets]
        # Store labels as a list: indexing a pandas Series that carries a
        # shuffled integer index with [i] looks up the *label* i, not the
        # i-th element, so it would not line up with self.data[i].
        self.labels = list(labels)
        if len(self.labels) != len(self.tokens):
            raise ValueError(
                f"got {len(self.tokens)} tweets but {len(self.labels)} labels"
            )
        self.vocab = vocab or build_vocab(self.tokens)
        self.data = [self.numericalize(tokens) for tokens in self.tokens]

    def numericalize(self, tokens):
        """Return the id of each token; tokens missing from the vocabulary get the ``"<unk>"`` id."""
        unk_id = self.vocab["<unk>"]
        return [self.vocab.get(tok, unk_id) for tok in tokens]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` tensors for the sample at position ``index``."""
        # Explicit dtype keeps an empty tweet an integer tensor instead of
        # the float32 that torch.tensor([]) would produce.
        token_ids = torch.tensor(self.data[index], dtype=torch.long)
        return token_ids, torch.tensor(self.labels[index])

In [None]:
def collate_fn(batch):
    """Collate ``(token_ids, label)`` pairs into a padded batch.

    Args:
        batch: Sequence of ``(1-D tensor of token ids, label)`` pairs.

    Returns:
        A ``(padded_ids, label_tensor)`` tuple. ``padded_ids`` has the batch
        dimension first and uses 0 as the fill value for shorter sequences.
    """
    id_tensors, raw_labels = zip(*batch)
    return (
        pad_sequence(id_tensors, batch_first=True, padding_value=0),
        torch.tensor(raw_labels),
    )

In [None]:
batch_size = 64
num_workers = 4

sample_tweets_df = pd.read_csv('./train-processed-sample.csv', header=None)

# No header row in the sample file: position 5 is the tweet text and
# position 7 the integer sentiment code.
all_tweets = sample_tweets_df.loc[:, 5]
all_labels = sample_tweets_df.loc[:, 7]

# First cut off 30% of the rows, stratified by label ...
tweets_train, tweets_temp, labels_train, labels_temp = train_test_split(
    all_tweets, all_labels, test_size=0.3, random_state=42, stratify=all_labels
)

# ... then halve that remainder into validation and test (15% each overall).
tweets_val, tweets_test, labels_val, labels_test = train_test_split(
    tweets_temp, labels_temp, test_size=0.5, random_state=42, stratify=labels_temp
)

# Labels go in as plain lists so they are indexed by position, not by the
# shuffled pandas index the split leaves on each Series.
# The vocabulary is built from the training tweets only and shared with the
# validation and test sets, so a token has the same id in every split.
train_data = TweetDataset(tweets=tweets_train, labels=labels_train.tolist())
val_data = TweetDataset(tweets=tweets_val, labels=labels_val.tolist(), vocab=train_data.vocab)
test_data = TweetDataset(tweets=tweets_test, labels=labels_test.tolist(), vocab=train_data.vocab)

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

In [None]:
# Embedding sanity check: a lookup table with 5 rows of 2 values each,
# queried with the ids 0..4 (one output row per id).
cat_mat_embed = nn.Embedding(5, 2)
sentences = torch.arange(5)
# Call the module itself rather than .forward(): __call__ is the supported
# entry point and also runs any registered hooks.
cat_mat_embed(sentences)