In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import pandas as pd
import math
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
nltk.download('words')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm,tqdm_pandas
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression

# Register `progress_apply` on pandas objects. `tqdm.pandas()` is the supported
# replacement for the deprecated `tqdm_pandas(tqdm())`, which also instantiated a
# stray, never-updated progress bar (the "0it [00:00, ?it/s]" line in the output).
tqdm.pandas()

# Print wide frames on a single line instead of wrapping columns.
pd.set_option('display.expand_frame_repr', False)

# NOTE(review): absolute, machine-specific path; adjust to the local copy of the dataset.
PATH="/home/yui/Documents/data/nlp/covidTweet/Corona_NLP_train.csv"
# Only the tweet text and its sentiment label are used, so only those columns are
# parsed; the trailing selection pins the column order regardless of file order.
TWEET_COLUMNS = ["OriginalTweet","Sentiment"]
df = pd.read_csv(PATH, encoding = "ISO-8859-1", usecols=TWEET_COLUMNS)[TWEET_COLUMNS]

def loadGlove(PATH):
    """Load word embeddings from a GloVe-style text file.

    Each non-blank line is expected to be ``word v1 v2 ... vD`` separated by
    single spaces.

    Parameters
    ----------
    PATH : str or path-like
        Location of the embedding text file (read as UTF-8).

    Returns
    -------
    d : dict[int, np.ndarray]
        Row index -> embedding vector.
    w2id : dict[str, int]
        Word -> row index.
    id2w : dict[int, str]
        Row index -> word.
    wmat : np.ndarray
        Matrix of shape ``(n_words, D)`` whose i-th row is the vector of word i.
    D : int
        Embedding dimension, taken from the first non-blank line.

    Raises
    ------
    ValueError
        If the file holds no vectors, or a line's vector length differs from D.
    """
    # Explicit encoding: without it, decoding depends on the platform locale.
    with open(PATH,'r',encoding='utf-8') as f:
        # Drop blank lines (e.g. a trailing empty line) instead of failing on them.
        lines = [line for line in f if line.strip()]
    if not lines:
        raise ValueError("no embedding vectors found in {!r}".format(PATH))

    d,w2id,id2w = {},{},{}
    # Split on a literal space only, so tokens containing other Unicode
    # whitespace are not broken apart.
    D = len(lines[0].rstrip().split(" ")) - 1
    wmat = np.zeros((len(lines),D))
    for i,line in enumerate(lines):
        tokens = line.rstrip().split(" ")
        word = tokens[0]
        vec = np.array([float(t) for t in tokens[1:]],dtype=float)
        if vec.shape[0] != D:
            raise ValueError(
                "entry {} ({!r}) has {} values, expected {}".format(
                    i,word,vec.shape[0],D))
        w2id[word]=i
        id2w[i]=word
        d[i]=vec
        wmat[i]=vec
    return d,w2id,id2w,wmat,D
# Load the GloVe vectors (the file name indicates the 6B, 50-dimensional set).
# NOTE(review): absolute, machine-specific path; this also rebinds PATH, which
# previously held the tweet CSV location.
PATH = "/home/yui/Documents/data/nlp/glove.6B/glove.6B.50d.txt"
# d: row index -> vector; w2id / id2w: word <-> row index;
# wmat: (n_words, D) embedding matrix; D: embedding dimension.
d,w2id,id2w,wmat,D = loadGlove(PATH)

[nltk_data] Downloading package words to /home/yui/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
0it [00:00, ?it/s]
  from pandas import Panel


In [2]:
class preprocessPipe:
    """Convert a raw sentence into a mean-of-word-vectors embedding.

    The pipeline lower-cases the text, tokenizes it, keeps tokens found in the
    NLTK English word list, lemmatizes them, and averages the GloVe vectors of
    the lemmas that have one.

    Relies on the module-level GloVe lookups ``w2id`` (word -> row index),
    ``d`` (row index -> vector) and ``D`` (embedding dimension).
    """
    def __init__(self):
        # Set for O(1) membership tests against the NLTK word list.
        self.eng = set(nltk.corpus.words.words())
        self.lem = WordNetLemmatizer()
        self.w2d = dict() # word:({docid:count,...},nt)
        # Lower-cased sentence -> 1-based id, filled by sent2id().
        self.s2id = dict()
    def get_wordnet_pos(self,word):
        """Map POS tag to first character lemmatize() accepts"""
        # The word is tagged in isolation; only the first letter of the tag is used.
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        # Any other tag falls back to noun.
        return tag_dict.get(tag,wordnet.NOUN)
    def lemmWord(self,w):
        """Return the WordNet lemma of ``w`` using its predicted part of speech."""
        return self.lem.lemmatize(w,
                self.get_wordnet_pos(w))
    def tokenSentence(self,s):
        """Split ``s`` into tokens with ``nltk.wordpunct_tokenize``."""
        return nltk.wordpunct_tokenize(s)
    def sent2id(self,sentence):
        """Return a stable 1-based id for ``sentence``, registering it if new.

        A sentence seen before keeps its id. Assigning ``len(self.s2id)+1``
        unconditionally would give a repeated sentence a fresh id and then
        hand that same id to the next new sentence.
        """
        if sentence not in self.s2id:
            self.s2id[sentence] = len(self.s2id)+1
        return self.s2id[sentence]
    def run(self,sentence):
        """Embed ``sentence`` as the mean GloVe vector of its usable words.

        Returns
        -------
        (np.ndarray, int)
            The mean vector of length ``D`` (all zeros when no word qualifies)
            and the number of words that were averaged.
        """
        sentence = sentence.lower()
        # Called for its side effect of registering the sentence in self.s2id.
        self.sent2id(sentence)
        total = np.zeros(D)
        lenEng = 0
        for w in self.tokenSentence(sentence):
            # The English-word filter is applied to the surface form,
            # before lemmatization.
            if w not in self.eng:
                continue
            w = self.lemmWord(w)
            if w not in w2id:
                continue
            total += d[w2id[w]]
            lenEng += 1
        if lenEng==0:
            # No usable word: return the zero vector rather than dividing by zero.
            return total,lenEng
        return total/lenEng,lenEng

In [3]:
# Embed every tweet. pp.run returns a (mean vector, averaged word count) pair
# per tweet; the pairs are transposed into the two new columns.
pp = preprocessPipe()
embedded = df["OriginalTweet"].progress_apply(pp.run)
df["Input"], df["Counts"] = zip(*embedded)

41157it [00:59, 692.74it/s]


#### Bag of words
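- The embedding $v(s)$ of a sentence $s$ is the average of the GloVe vectors $v(w)$ of its words, where only the words kept by the preprocessing (English words that have a GloVe vector) are summed and counted in $|s|$: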

$$v(s) = \frac{1}{|s|}\sum_{w\in s}v(w)$$

In [4]:
# Keep only tweets whose embedding averaged strictly more than `wordLim` words.
wordLim = 5
has_enough_words = df["Counts"].gt(wordLim)
df = df[has_enough_words]

In [5]:
labels = list(df["Sentiment"].unique())
print(labels)
# Binary target: 1 for the two "Extremely ..." sentiments, 0 for the rest.
# The classes are selected by name rather than by position in `labels`:
# `unique()` returns labels in first-occurrence order, so positional indices
# would silently pick different classes if the rows or the filter changed.
EXTREME_SENTIMENTS = ["Extremely Negative", "Extremely Positive"]
df["Output"] = df["Sentiment"].isin(EXTREME_SENTIMENTS).astype(int)
df.head(5)

['Positive', 'Extremely Negative', 'Neutral', 'Negative', 'Extremely Positive']


Unnamed: 0,OriginalTweet,Sentiment,Input,Counts,Output
1,advice Talk to your neighbours family to excha...,Positive,"(0.33271789285714287, 0.17252346428571425, 0.1...",28,0
2,Coronavirus Australia: Woolworths to give elde...,Positive,"(0.4782337499999999, 0.07637737499999996, 0.28...",8,0
3,My food stock is not the only one which is emp...,Positive,"(0.38000114054054046, 0.03723013513513513, 0.1...",37,0
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"(0.31456214, 0.017596571428571423, 0.059678925...",35,1
5,As news of the regionÂs first confirmed COVID...,Positive,"(0.31297992857142853, 0.10409821428571424, 0.0...",28,0


In [7]:
# Features are the per-tweet mean vectors; targets are the binary labels.
X = df["Input"].tolist()
Y = df["Output"].tolist()
clf = LogisticRegression(max_iter=1000)
clf.fit(X, Y)

In [8]:
# Mean accuracy on X, Y, the same data the model was fitted on in the previous
# cell, so this is a training score and not a held-out estimate.
clf.score(X,Y)

0.7034127301841473