In [1]:
#Installing Afinn Lexicon
!pip install contractions
!pip install afinn
from afinn import Afinn

You should consider upgrading via the '/Users/victornathanael/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/victornathanael/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
#Importing Libraries for Processing the Data
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import re,string,unicodedata
%matplotlib inline
import contractions

In [3]:
#Importing Libraries For Text Preprocessing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
#Preparing the Dict
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#Import the dataset
data = pd.read_csv("data.csv", encoding= 'unicode_escape')
print(data.head())

   Unnamed: 0                                               text name  \
0           1  Theresa May has struck a deal with Brussels th...   UK   
1           2  The agreement may give UK firms continued acce...   UK   
2           3  Nobody knows when, but the Brexit secretary, D...   UK   
3           4  There's no shame in asking voters to express t...   UK   
4           5  Dominic Raab warns that "obstacles remain" but...   UK   

         Date  sentiment  
0  2018-11-01          0  
1  2018-11-01          1  
2  2018-11-01          0  
3  2018-11-01          0  
4  2018-11-01          0  


In [6]:
#Remove unused column (Name and Date) and missing values
data = data.drop(columns = ['name','Date','sentiment'])
data = data.dropna()
print(data.head())

   Unnamed: 0                                               text
0           1  Theresa May has struck a deal with Brussels th...
1           2  The agreement may give UK firms continued acce...
2           3  Nobody knows when, but the Brexit secretary, D...
3           4  There's no shame in asking voters to express t...
4           5  Dominic Raab warns that "obstacles remain" but...


In [7]:
#Preprocessing

#Case Folding
data['text'] = data['text'].str.lower() #Lowercase the sentence

#Tokenizing 
data['text'] = data['text'].str.strip() #Remove Leading Space and Trailing Space

from string import punctuation

def remove_punct(text):
  for punctuations in punctuation:
    text = text.replace(punctuations, '')
  return text

data['text'] = data['text'].apply(remove_punct) #Remove Punctuation

def remove_special_char(text, remove_digits=True):
  pattern = r'[^a-zA-z0-9\s]'
  text = re.sub(pattern, '', text)
  return text

data['text'] = data['text'].apply(remove_special_char) #Remove Symbols or other special characters

def expand_contractions(con_text):
  con_text = contractions.fix(con_text)
  return con_text
  
data['text'] = data['text'].apply(expand_contractions) #Expand English Contractions (Ex : I've -> I have)

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


data['text'] = data['text'].apply(remove_accented_chars) #Remove macrons & accented characters

#Filtering
tokenizer = ToktokTokenizer()
stopword_list = set(stopwords.words('english'))

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

data['text'] = data['text'].apply(remove_stopwords) 
print(data.head())

   Unnamed: 0                                               text
0           1  theresa may struck deal brussels would give uk...
1           2  agreement may give uk firms continued access e...
2           3  nobody knows brexit secretary dominic raab lik...
3           4  shame asking voters express views second time ...
4           5  dominic raab warns obstacles remain insists en...


In [10]:
#Using Afinn Lexicon to classify the sentence
af = Afinn()

def afinn_sent_analysis(text):
  score = af.score(text)
  return score

#applying the function to Normalized Comments
data['afinn_score'] = [afinn_sent_analysis(comm) for comm in data['text']]

#If Afinn Score is more than 0 : The text contains Positive Sentiment
                #  less than 0 : The text contains Negative Sentiment
                #  equals to 0 : The text contains Neutral Sentiment
        
def afinn_sent_category(score):
  categories = ['Good News','Bad News','Neutral']
  if score > 0:
    return categories[0]
  elif score < 0:
    return categories[1]
  else:
    return categories[2]  

data['afinn_sent_category'] = [afinn_sent_category(scr) for scr in data['afinn_score']]
print(data)

       Unnamed: 0                                               text  \
0               1  theresa may struck deal brussels would give uk...   
1               2  agreement may give uk firms continued access e...   
2               3  nobody knows brexit secretary dominic raab lik...   
3               4  shame asking voters express views second time ...   
4               5  dominic raab warns obstacles remain insists en...   
...           ...                                                ...   
93608       93609  lagence nationale de scurit du mdicament et de...   
93609       93610  les dputs ont rcemment adopt un amendement au ...   
93610       93611  la mesure doit tre dbattue par les dputs alors...   
93611       93612  la taille et la forme du canal pelvignital des...   
93612       93613  sant le corps des femmes recle encore de nombr...   

       afinn_score afinn_sent_category  
0             -1.0            Bad News  
1              1.0           Good News  
2           