In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import pandas as pd
import math
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
# Fetch the NLTK resources this notebook relies on (word list, WordNet,
# and the POS tagger model); already-present packages are left as they are.
for _resource in ('words', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm,tqdm_pandas

# Register `progress_apply` on pandas objects. `tqdm.pandas()` is the
# supported entry point; the older `tqdm_pandas(tqdm())` form is deprecated
# and instantiates a throwaway bar that prints a stray "0it" line.
tqdm.pandas()

# Keep wide frames on one line when they are displayed.
pd.set_option('display.expand_frame_repr', False)
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs elsewhere.
PATH="/home/yui/Documents/data/nlp/covidTweet/Corona_NLP_train.csv"
df = pd.read_csv(PATH, encoding = "ISO-8859-1")
# Only the raw tweet text and its sentiment label are used below.
df = df[["OriginalTweet","Sentiment"]]

def loadGlove(PATH):
    """Load a GloVe-style text embedding file.

    Each non-blank line is expected to be ``word v1 v2 ... vD`` with single
    spaces as separators.

    Parameters
    ----------
    PATH : str
        Path to the embedding text file.

    Returns
    -------
    d : dict[int, numpy.ndarray]
        Row id -> embedding vector (a separate array per word).
    w2id : dict[str, int]
        Word -> row id.
    id2w : dict[int, str]
        Row id -> word.
    wmat : numpy.ndarray
        ``(n_words, D)`` matrix whose row ``i`` is the vector of ``id2w[i]``.
    D : int
        Embedding dimensionality, taken from the first non-blank line.
        For a file with no entries, ``wmat`` has shape ``(0, 0)`` and ``D`` is 0.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float or a line does not have
        ``D`` components.
    """
    d,w2id,id2w = {},{},{}
    # Decode explicitly as UTF-8 instead of relying on the locale default:
    # embedding vocabularies contain non-ASCII tokens.
    with open(PATH,'r',encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # vector and would otherwise fail when written into the matrix.
        rows = [line.rstrip("\n").split(" ") for line in f if line.strip()]
    if not rows:
        return d,w2id,id2w,np.zeros((0,0)),0
    D = len(rows[0])-1
    wmat = np.zeros((len(rows),D))
    for i,tokens in enumerate(rows):
        word = tokens[0]
        w2id[word]=i
        id2w[i]=word
        vec = np.array(list(map(float,tokens[1:])))
        d[i]=vec
        wmat[i]=vec
    return d,w2id,id2w,wmat,D

class preprocessPipe:
    """Turn a raw sentence into one averaged word-embedding vector.

    ``run`` lower-cases the text, tokenizes it, keeps only tokens found in
    the NLTK English word list, lemmatizes them, and averages the embeddings
    of the lemmas that are present in the embedding vocabulary.

    NOTE: ``run`` reads the module-level names ``w2id``, ``d`` and ``D``
    (the values returned by ``loadGlove``); they must exist before ``run``
    is called.
    """
    def __init__(self):
        # English vocabulary used as a whitelist for tokens.
        self.eng = set(nltk.corpus.words.words())
        self.lem = WordNetLemmatizer()
        # Not populated by any method of this class.
        self.w2d = dict() # word:({docid:count,...},nt)
        # sentence text -> integer id (1-based), filled by sent2id().
        self.s2id = dict()
    def get_wordnet_pos(self,word):
        """Map POS tag to first character lemmatize() accepts"""
        # The word is tagged in isolation (no sentence context); only the
        # first letter of the tag is used for the lookup.
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        # Anything that is not adjective/noun/verb/adverb is treated as a noun.
        return tag_dict.get(tag,wordnet.NOUN)
    def lemmWord(self,w):
        """Return the lemma of ``w`` using its POS from get_wordnet_pos()."""
        return self.lem.lemmatize(w,
                self.get_wordnet_pos(w))
    def tokenSentence(self,s):
        """Split ``s`` into tokens with nltk.wordpunct_tokenize."""
        return nltk.wordpunct_tokenize(s)
    def sent2id(self,sentence):
        """Return the id of ``sentence``, assigning the next id if it is new.

        Ids start at 1. A sentence that was seen before keeps its original
        id; re-assigning ``len(self.s2id)+1`` on every call would change the
        id of a repeated sentence and make the next new sentence collide
        with it.
        """
        ind = self.s2id.get(sentence)
        if ind is None:
            ind = len(self.s2id)+1
            self.s2id[sentence]=ind
        return ind
    def run(self,sentence):
        """Embed ``sentence`` as the mean vector of its usable tokens.

        Returns a tuple ``(vector, n)`` where ``n`` is the number of tokens
        that contributed to the mean. When no token is usable the vector is
        ``np.zeros(D)`` and ``n`` is 0.
        """
        sentence = sentence.lower()
        # Registers the sentence; the id itself is not used further here.
        ind = self.sent2id(sentence)
        tokens = self.tokenSentence(sentence)
        vec,lenEng = 0,0
        for w in tokens:
            # Drop tokens outside the English word list (punctuation,
            # handles, URLs fragments, ...).
            if w not in self.eng:
                continue
            w = self.lemmWord(w)
            # Drop lemmas that have no embedding.
            if w not in w2id:
                continue
            vec+=d[w2id[w]]
            lenEng+=1
        if lenEng==0:
            return np.zeros(D),lenEng
        return vec/lenEng,lenEng
    
# Load the GloVe table; `run` below looks these names up at module level.
PATH = "/home/yui/Documents/data/nlp/glove.6B/glove.6B.50d.txt"
d, w2id, id2w, wmat, D = loadGlove(PATH)

# Embed every tweet; each call yields (mean vector, number of tokens used),
# which is unzipped into two new columns.
pp = preprocessPipe()
df["Input"], df["Counts"] = zip(
    *df["OriginalTweet"].progress_apply(pp.run)
)

[nltk_data] Downloading package words to /home/yui/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
0it [00:00, ?it/s]
  from pandas import Panel
41157it [01:01, 671.14it/s]


In [14]:
# Number of space-separated pieces in each raw tweet (split on a single " ").
df["CountWords"] = df["OriginalTweet"].map(
    lambda tweet: len(tweet.split(" "))
)

In [15]:
# Summary statistics of the raw per-tweet piece count.
df.CountWords.describe()

count    41157.000000
mean        30.327818
std         11.633754
min          1.000000
25%         21.000000
50%         31.000000
75%         40.000000
max        127.000000
Name: CountWords, dtype: float64

In [2]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,OriginalTweet,Sentiment,Input,Counts
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,"(-0.120218, 0.428472, 0.464072, 0.5489208, 0.2...",5
1,advice Talk to your neighbours family to excha...,Positive,"(0.33271789285714287, 0.17252346428571425, 0.1...",28
2,Coronavirus Australia: Woolworths to give elde...,Positive,"(0.4782337499999999, 0.07637737499999996, 0.28...",8
3,My food stock is not the only one which is emp...,Positive,"(0.38000114054054046, 0.03723013513513513, 0.1...",37
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"(0.31456214, 0.017596571428571423, 0.059678925...",35


In [12]:
# Summary statistics of how many tokens contributed to each tweet's vector.
df["Counts"].describe()

count    41157.000000
mean        23.341497
std         10.568691
min          0.000000
25%         15.000000
50%         24.000000
75%         31.000000
max         61.000000
Name: Counts, dtype: float64

In [8]:
# Collect the set of English-word-list tokens over all tweets, and the
# largest number of such tokens in a single tweet. Tokens come from a
# lower-cased split on single spaces.
originalTweets = df["OriginalTweet"]
vocab, maxLen = set(), 0
for tweet in tqdm(originalTweets):
    kept = [tok for tok in tweet.lower().split(" ") if tok in pp.eng]
    maxLen = max(maxLen, len(kept))
    vocab.update(kept)

100%|██████████| 41157/41157 [00:00<00:00, 100101.11it/s]


In [9]:
# Vocabulary size followed by the longest filtered tweet length.
print(f"{len(vocab)} {maxLen}")

12010 55


In [10]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [11]:
# Shape probe for a strided 1-D convolution:
# (batch, in_channels, length) -> (batch, out_channels, out_length).
m = nn.Conv1d(16, 33, 3, stride=2)
# Named `batch` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
batch = torch.randn(20, 16, 50)
output = m(batch)
print(output.shape)

torch.Size([20, 33, 24])


In [13]:
# Shape probe for a strided 2-D convolution:
# (batch, in_channels, height, width) -> (batch, out_channels, out_h, out_w).
m = nn.Conv2d(16, 33, 3, stride=2)
# Named `batch` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
batch = torch.randn(20, 16, 50, 100)
output = m(batch)
print(output.shape)

torch.Size([20, 33, 24, 49])
