In [1]:
#Installing Afinn Lexicon
!pip install contractions
!pip install afinn
from afinn import Afinn

You should consider upgrading via the '/Users/victornathanael/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/victornathanael/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
#Importing Libraries for Processing the Data
import numpy as np
import pandas as pd
import re,string,unicodedata
%matplotlib inline
import contractions

In [3]:
#Importing Libraries For Text Preprocessing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
#Preparing the Dict
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victornathanael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
#Adding Labels
import csv

with open('data.csv',newline='',encoding = 'unicode_escape') as f:
    r = csv.reader(f)
    data = [line for line in r]
with open('data.csv','w',newline='', encoding= 'unicode_escape') as f:
    w = csv.writer(f)
    w.writerow(['sentiment_from_dataset','news'])
    w.writerows(data)

In [7]:
#Import the dataset

data = pd.read_csv("data.csv", encoding= 'unicode_escape')
print(data.head())

  sentiment_from_dataset                                               news
0                neutral  According to Gran , the company has no plans t...
1                neutral  Technopolis plans to develop in stages an area...
2               negative  The international electronic industry company ...
3               positive  With the new production plant the company woul...
4               positive  According to the company 's updated strategy f...


In [8]:
#Remove unused column (Name and Date) and missing values
data = data.dropna()
print(data.head())

  sentiment_from_dataset                                               news
0                neutral  According to Gran , the company has no plans t...
1                neutral  Technopolis plans to develop in stages an area...
2               negative  The international electronic industry company ...
3               positive  With the new production plant the company woul...
4               positive  According to the company 's updated strategy f...


In [9]:
#Preprocessing

#Case Folding
data['news'] = data['news'].str.lower() #Lowercase the sentence

#Tokenizing 
data['news'] = data['news'].str.strip() #Remove Leading Space and Trailing Space

from string import punctuation

def remove_punct(text):
  for punctuations in punctuation:
    text = text.replace(punctuations, '')
  return text

data['news'] = data['news'].apply(remove_punct) #Remove Punctuation

def remove_special_char(text, remove_digits=True):
  pattern = r'[^a-zA-z0-9\s]'
  text = re.sub(pattern, '', text)
  return text

data['news'] = data['news'].apply(remove_special_char) #Remove Symbols or other special characters

def expand_contractions(con_text):
  con_text = contractions.fix(con_text)
  return con_text
  
data['news'] = data['news'].apply(expand_contractions) #Expand English Contractions (Ex : I've -> I have)

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


data['news'] = data['news'].apply(remove_accented_chars) #Remove macrons & accented characters

#Filtering
tokenizer = ToktokTokenizer()
stopword_list = set(stopwords.words('english'))

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

data['news'] = data['news'].apply(remove_stopwords) 
print(data.head())

  sentiment_from_dataset                                               news
0                neutral  according gran company plans move production r...
1                neutral  technopolis plans develop stages area less 100...
2               negative  international electronic industry company elco...
3               positive  new production plant company would increase ca...
4               positive  according company updated strategy years 20092...


In [10]:
#Using Afinn Lexicon to classify the sentence
af = Afinn()

def afinn_sent_analysis(text):
  score = af.score(text)
  return score

#applying the function to Normalized Comments
data['afinn_score'] = [afinn_sent_analysis(comm) for comm in data['news']]

#If Afinn Score is more than 0 : The text contains Positive Sentiment
                #  less than 0 : The text contains Negative Sentiment
                #  equals to 0 : The text contains Neutral Sentiment
        
def afinn_sent_category(score):
  categories = ['positive','negative','neutral']
  if score > 0:
    return categories[0]
  elif score < 0:
    return categories[1]
  else:
    return categories[2]  

data['sentiment_results_afinn'] = [afinn_sent_category(scr) for scr in data['afinn_score']]
print(data)

     sentiment_from_dataset  \
0                   neutral   
1                   neutral   
2                  negative   
3                  positive   
4                  positive   
...                     ...   
4841               negative   
4842                neutral   
4843               negative   
4844               negative   
4845               negative   

                                                   news  afinn_score  \
0     according gran company plans move production r...          1.0   
1     technopolis plans develop stages area less 100...          0.0   
2     international electronic industry company elco...          0.0   
3     new production plant company would increase ca...          4.0   
4     according company updated strategy years 20092...          4.0   
...                                                 ...          ...   
4841  london marketwatch share prices ended lower lo...         -3.0   
4842  rinkuskiai beer sales fell 65 per cent 416 mi

In [13]:
data['comparison'] = (data['sentiment_from_dataset'] == data['sentiment_results_afinn'])
print(data)

     sentiment_from_dataset  \
0                   neutral   
1                   neutral   
2                  negative   
3                  positive   
4                  positive   
...                     ...   
4841               negative   
4842                neutral   
4843               negative   
4844               negative   
4845               negative   

                                                   news  afinn_score  \
0     according gran company plans move production r...          1.0   
1     technopolis plans develop stages area less 100...          0.0   
2     international electronic industry company elco...          0.0   
3     new production plant company would increase ca...          4.0   
4     according company updated strategy years 20092...          4.0   
...                                                 ...          ...   
4841  london marketwatch share prices ended lower lo...         -3.0   
4842  rinkuskiai beer sales fell 65 per cent 416 mi

In [18]:
data["comparison"].value_counts()

True     2885
False    1961
Name: comparison, dtype: int64

In [19]:
accuracy = 2885/4845
print("Accuracy = ", accuracy)

Accuracy =  0.5954592363261094
