In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
df = pd.read_csv("Corona_NLP_train.csv", encoding='latin-1')

In [3]:
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [4]:
df["Sentiment"].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [5]:
# Print the first three raw tweets in full; the DataFrame repr truncates them.
for tweet in df["OriginalTweet"].head(3):
    print(tweet)

@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8
advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order
Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P


In [6]:
df = df.loc[:,["OriginalTweet", "Sentiment"]]

In [7]:
df.columns = ["text", "sentiment"]

In [8]:
df.isnull().sum()

text         0
sentiment    0
dtype: int64

# Text Preprocessing

In [9]:
import nltk
import re
import string

In [10]:
def clean_text(text):
    '''Normalise a raw tweet into lowercase text without markup or noise.

    Steps, applied in this order:
      1. cast to ``str`` and lowercase;
      2. drop ``[...]`` spans (non-greedy, within a single line);
      3. drop ``http(s)://`` and ``www.`` links;
      4. drop ``<...>`` tags;
      5. drop every character listed in ``string.punctuation``;
      6. turn each newline into a space;
      7. drop every word that contains a digit.

    Parameters
    ----------
    text : any
        Value to clean. It is passed through ``str()`` first, so non-string
        input is cleaned as its string form.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed spans can
        leave runs of spaces behind.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling, which warns about them on recent versions.
    text = re.sub(r'\[.*?\]', '', text)
    # Links are removed before punctuation so the URL is still recognisable.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace (not delete) newlines so words on adjacent lines are not fused.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [11]:
df["filtered_text"] = df["text"].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,text,sentiment,filtered_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,menyrbie philgahan chrisitv and and
1,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the outb...


In [13]:
from nltk.corpus import stopwords

In [14]:
# Fetch the corpus first so this cell also works on a machine where the NLTK
# stop-word list has never been downloaded (same approach as punkt/wordnet).
nltk.download('stopwords')
# A set gives constant-time `word not in stopword` checks; the list returned by
# stopwords.words() would be scanned linearly for every token of every tweet.
stopword = set(stopwords.words("english"))

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/yahor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
df["filtered_text"] = df["filtered_text"].apply(nltk.word_tokenize)

In [17]:
df.head()

Unnamed: 0,text,sentiment,filtered_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,"[menyrbie, philgahan, chrisitv, and, and]"
1,advice Talk to your neighbours family to excha...,Positive,"[advice, talk, to, your, neighbours, family, t..."
2,Coronavirus Australia: Woolworths to give elde...,Positive,"[coronavirus, australia, woolworths, to, give,..."
3,My food stock is not the only one which is emp...,Positive,"[my, food, stock, is, not, the, only, one, whi..."
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"[me, ready, to, go, at, supermarket, during, t..."


In [18]:
def remove_stopwords(x):
    '''Return the tokens of ``x``, in order, that are not in ``stopword``.

    ``x`` is an iterable of tokens; ``stopword`` is the module-level
    collection of stop words. A new list is returned and ``x`` is not modified.
    '''
    return [token for token in x if token not in stopword]

In [19]:
df["filtered_text"] = df["filtered_text"].apply(remove_stopwords)

In [20]:
df.head()

Unnamed: 0,text,sentiment,filtered_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,"[menyrbie, philgahan, chrisitv]"
1,advice Talk to your neighbours family to excha...,Positive,"[advice, talk, neighbours, family, exchange, p..."
2,Coronavirus Australia: Woolworths to give elde...,Positive,"[coronavirus, australia, woolworths, give, eld..."
3,My food stock is not the only one which is emp...,Positive,"[food, stock, one, empty, please, dont, panic,..."
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"[ready, go, supermarket, outbreak, im, paranoi..."


In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [22]:
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/yahor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
def lemma(x):
    '''Lemmatize every token of ``x`` with the module-level ``wordnet_lemmatizer``.

    Returns a new list holding one lemmatized token per input token, in the
    same order.
    '''
    return [wordnet_lemmatizer.lemmatize(token) for token in x]

In [25]:
def stemming(x):
    '''Stem every token of ``x`` with the module-level ``snowball_stemmer``.

    Returns a new list holding one stemmed token per input token, in the
    same order.
    '''
    return [snowball_stemmer.stem(token) for token in x]

In [26]:
def lemma_stemming(x):
    '''Lemmatize the tokens of ``x`` with ``lemma``, then stem the result with ``stemming``.'''
    return stemming(lemma(x))

In [27]:
df["lemma_text"] = df["filtered_text"].apply(lemma)

In [28]:
df["stemming_text"] = df["filtered_text"].apply(stemming)

In [29]:
df["lemma_stemming_text"] = df["filtered_text"].apply(lemma_stemming)

In [30]:
df.head()

Unnamed: 0,text,sentiment,filtered_text,lemma_text,stemming_text,lemma_stemming_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,"[menyrbie, philgahan, chrisitv]","[menyrbie, philgahan, chrisitv]","[menyrbi, philgahan, chrisitv]","[menyrbi, philgahan, chrisitv]"
1,advice Talk to your neighbours family to excha...,Positive,"[advice, talk, neighbours, family, exchange, p...","[advice, talk, neighbour, family, exchange, ph...","[advic, talk, neighbour, famili, exchang, phon...","[advic, talk, neighbour, famili, exchang, phon..."
2,Coronavirus Australia: Woolworths to give elde...,Positive,"[coronavirus, australia, woolworths, give, eld...","[coronavirus, australia, woolworth, give, elde...","[coronavirus, australia, woolworth, give, elde...","[coronavirus, australia, woolworth, give, elde..."
3,My food stock is not the only one which is emp...,Positive,"[food, stock, one, empty, please, dont, panic,...","[food, stock, one, empty, please, dont, panic,...","[food, stock, one, empti, pleas, dont, panic, ...","[food, stock, one, empti, pleas, dont, panic, ..."
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"[ready, go, supermarket, outbreak, im, paranoi...","[ready, go, supermarket, outbreak, im, paranoi...","[readi, go, supermarket, outbreak, im, paranoi...","[readi, go, supermarket, outbreak, im, paranoi..."


In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
le = LabelEncoder()

In [33]:
df["sentiment"] = le.fit_transform(df["sentiment"])

In [34]:
df.head()

Unnamed: 0,text,sentiment,filtered_text,lemma_text,stemming_text,lemma_stemming_text
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,3,"[menyrbie, philgahan, chrisitv]","[menyrbie, philgahan, chrisitv]","[menyrbi, philgahan, chrisitv]","[menyrbi, philgahan, chrisitv]"
1,advice Talk to your neighbours family to excha...,4,"[advice, talk, neighbours, family, exchange, p...","[advice, talk, neighbour, family, exchange, ph...","[advic, talk, neighbour, famili, exchang, phon...","[advic, talk, neighbour, famili, exchang, phon..."
2,Coronavirus Australia: Woolworths to give elde...,4,"[coronavirus, australia, woolworths, give, eld...","[coronavirus, australia, woolworth, give, elde...","[coronavirus, australia, woolworth, give, elde...","[coronavirus, australia, woolworth, give, elde..."
3,My food stock is not the only one which is emp...,4,"[food, stock, one, empty, please, dont, panic,...","[food, stock, one, empty, please, dont, panic,...","[food, stock, one, empti, pleas, dont, panic, ...","[food, stock, one, empti, pleas, dont, panic, ..."
4,"Me, ready to go at supermarket during the #COV...",0,"[ready, go, supermarket, outbreak, im, paranoi...","[ready, go, supermarket, outbreak, im, paranoi...","[readi, go, supermarket, outbreak, im, paranoi...","[readi, go, supermarket, outbreak, im, paranoi..."


# Test DataFrame Preprocessing

In [35]:
df_test = pd.read_csv("Corona_NLP_test.csv", encoding='latin-1')

In [36]:
# FIXME(review): this cell is unfinished. The `for` statement below is cut off
# mid-line, so the cell raises SyntaxError and `preprocess` is never defined.
# Complete it or delete it before running the notebook top to bottom.
def preprocess(x):
    arr = []
    for word i

SyntaxError: invalid syntax (1545085199.py, line 3)

# Building a model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

In [None]:
X = df["text"]

In [None]:
vectorizer = TfidfVectorizer(stop_words="english")

In [None]:
X_train = vectorizer.fit_transform(X)