# <center> Lab 4 : Twitter Sentiment Analysis </center>

### Name : Taha Nouali

## Part 1: Practical example using SentiWordNet

# **Importing Required Libraries**

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch, in this order, every NLTK resource the first part of the notebook uses.
for resource in ('wordnet', 'sentiwordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# One shared lemmatizer instance, reused by the lemmatization and tweet-cleaning cells.
lemmatizer = WordNetLemmatizer()

# Example sentence that Part 1 walks through step by step.
sentence = "One of the best movie of all time. Period."

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\sentiwordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


# **Text PreProcessing**

#### (1) Removing Punctuations

In [2]:
# Characters stripped from the sentence. The backslash is spelled "\\" because a
# bare "\," is an invalid escape sequence (a SyntaxWarning on Python 3.12+);
# the resulting string is unchanged.
punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''

# Delete every punctuation character in one pass with a translation table
# instead of calling str.replace once per matching character.
sentence = sentence.translate(str.maketrans("", "", punctuations))

print(sentence)

One of the best movie of all time Period


#### (2) Change Case + Tokenization

In [3]:
# Lower-case the sentence before tokenizing so every later step works on
# case-normalized tokens.
Tokens = nltk.word_tokenize(sentence.lower())
print(Tokens)

['one', 'of', 'the', 'best', 'movie', 'of', 'all', 'time', 'period']


#### (3) Removing Stop Words - a, an, the, are, is etc.

In [4]:
# A set gives constant-time membership tests while filtering.
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not English stop words (order is preserved).
clean_Tokens = [word for word in Tokens if word not in stop_words]
print(clean_Tokens)

['one', 'best', 'movie', 'time', 'period']


#### (4) Lemmatization

In [5]:
# Run every remaining token through the shared lemmatizer. No part of speech
# is passed, so the lemmatizer's default is used for each token.
lemma = list(map(lemmatizer.lemmatize, clean_Tokens))

print(lemma)

['one', 'best', 'movie', 'time', 'period']


#### (5) POS Tagging

A part-of-speech tagger, or POS tagger, processes a sequence of words and attaches a part-of-speech tag to each word. (Don't forget to import nltk first.)

In [6]:
# Show the lemmatized tokens that are POS-tagged in the cells below.
lemma

['one', 'best', 'movie', 'time', 'period']

In [7]:
# Print the documentation of nltk.pos_tag (signature, examples, `tagset` option).
help(nltk.pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None, lang='eng')
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged

In [8]:
# Tag every lemma; the result is a list of (word, tag) pairs.
pos_val = nltk.pos_tag(lemma)
print(pos_val)

# Zero the running totals (positive / negative / objective) that the
# aggregation step adds to. NOTE(review): `count` is not used anywhere in the
# visible part of the notebook — confirm before removing.
pos=neg=obj=count=0

[('one', 'CD'), ('best', 'JJS'), ('movie', 'NN'), ('time', 'NN'), ('period', 'NN')]


CC, a coordinating conjunction;

RB, an adverb;  

IN, a preposition; 

NN, a noun; 

JJ, an adjective.

The most popular tag set is the Penn Treebank tag set. Most of the pre-trained taggers for English are trained on it. To view the complete list, follow this link: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [9]:
# We can get more details about any POS tag using the help function of NLTK as follows.
# The 'tagsets' resource must be downloaded before the help text is available.
nltk.download('tagsets')
# The argument is a pattern over tag names; "JJS$" selects the JJS entry.
nltk.help.upenn_tagset("JJS$")

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...


JJS: adjective, superlative
    calmest cheapest choicest classiest cleanest clearest closest commonest
    corniest costliest crassest creepiest crudest cutest darkest deadliest
    dearest deepest densest dinkiest ...


[nltk_data]   Unzipping help\tagsets.zip.


In [17]:
# Tag the same lemmas with the coarse 'universal' tagset (NOUN, ADJ, ...)
# instead of the fine-grained default tags.
# NOTE(review): the saved output of this cell shows a single token 'dont',
# which does not match the five-word `lemma` printed earlier, and its
# execution count is out of sequence — re-run the notebook top to bottom to
# refresh it.
nltk.download('universal_tagset')
pos_val2 = nltk.pos_tag(lemma, tagset='universal')
print(pos_val2)

[('dont', 'NOUN')]


[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\a\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# **Functions for Sentiment Scoring**

In [10]:
# Translate Penn Treebank tags into the simpler WordNet part-of-speech constants
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected:
    ``J`` -> ``wn.ADJ``, ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``,
    ``V`` -> ``wn.VERB``. Any other tag (including an empty string)
    yields ``None``.
    """
    # Attribute names are stored rather than values so that `wn` is only
    # touched for a tag that actually maps to something.
    attr_by_initial = {'J': 'ADJ', 'N': 'NOUN', 'R': 'ADV', 'V': 'VERB'}
    attr_name = attr_by_initial.get(tag[:1])
    if attr_name is None:
        return None
    return getattr(wn, attr_name)

You can find the whole Penn Treebank tag set here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [21]:
# Returns [synset name, positive, negative, objective] scores, or an empty
# list when the word cannot be scored.
def get_sentiment(word, tag):
    """Look up SentiWordNet scores for the most common sense of ``word``.

    Parameters
    ----------
    word : str
        The (already lemmatized) word to score.
    tag : str
        Penn Treebank POS tag of ``word``; converted with ``penn_to_wn``.

    Returns
    -------
    list
        ``[synset_name, pos_score, neg_score, obj_score]`` for the first
        WordNet synset of ``word`` with the converted POS, or ``[]`` when
        the tag is not a noun/adjective/adverb/verb, WordNet has no synset
        for the word, or SentiWordNet has no entry for that synset.
    """
    wn_tag = penn_to_wn(tag)

    # penn_to_wn gives None for any tag outside the four scored categories.
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        return []

    # A synset groups synonymous words that express one concept; a word may
    # belong to one synset or to several.
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # senti_synset yields None when the synset has no SentiWordNet entry;
    # honour the "empty list if not present" contract instead of raising
    # AttributeError on None.
    if swn_synset is None:
        return []

    return [synset.name(), swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score()]

In [22]:
# Quick check: penn_to_wn only looks at the first letter of the tag, so 'J'
# alone is enough to request the adjective sense.
get_sentiment('happy', 'J')

['happy.a.01', 0.875, 0.0, 0.125]

In [23]:
# Score every (word, tag) pair of the example sentence; words that cannot be
# scored appear as empty lists.
senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
print(senti_val)

[[], ['best.a.01', 0.75, 0.0, 0.25], ['movie.n.01', 0.0, 0.0, 1.0], ['time.n.01', 0.0, 0.0, 1.0], ['time_period.n.01', 0.125, 0.125, 0.75]]


# **Aggregating Scores**

In [24]:
# Start the totals from zero inside this cell so that re-running it does not
# add the same scores on top of the previous run's totals.
pos = neg = obj = 0

for scores in senti_val:
    # get_sentiment returns [] for words it cannot score; skip those
    # explicitly instead of relying on a bare `except` to hide the IndexError.
    if not scores:
        continue
    # scores is [synset name, positive, negative, objective]
    pos += scores[1]
    neg += scores[2]
    obj += scores[3]

In [25]:
# Report the summed positive and negative scores; the overall sentiment is
# their difference (the objective total is not part of the report).
print("Positive weight : {0} ".format(pos))
print("Negative weight : {0} ".format(neg))
print("Sentiment of the statement is {0} ".format(pos - neg))

Positive weight : 0.875 
Negative weight : 0.125 
Sentiment of the statement is 0.75 


## Part 2: Twitter Sentiment Analysis

In [90]:
# Data: the tweets are read from an .xlsx file, which pandas reads through the
# openpyxl package. Uncomment the line below once if openpyxl is missing.
#!pip install openpyxl

In [26]:
import pandas as pd
# Load the airline tweets workbook from the notebook's working directory.
Tweet = pd.read_excel("US_Airline_Tweets.xlsx")  # NOTE(review): presumably alternative datasets: Weather_Final.csv, Airline_Final.csv

In [27]:
# Keep only the raw tweet text column; the bare name below displays it.
data=Tweet['Tweets']
data

0                    @VirginAmerica What @dhepburn said.
1      @VirginAmerica plus you've added commercials t...
2      @VirginAmerica I didn't today... Must mean I n...
3      @VirginAmerica it's really aggressive to blast...
4      @VirginAmerica and it's a really big bad thing...
                             ...                        
995    @united #UnitedAirlines Pls Fix #AspenBaggageF...
996    @united Read my bio. See who I work with. I ha...
997    @united Does customer care have email or a pho...
998    Thank you â€œ@united: @TRUU_Tall I can certain...
999    @united In the process of recovering their car...
Name: Tweets, Length: 1000, dtype: object

In [28]:
import re 
#preprocessing
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def tweet_preprocess(raw_tweet):
    """Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order: drop ``@mentions``, drop ``http...`` links, replace
    every non-letter with a space, lower-case and split on whitespace,
    remove English stop words, lemmatize each remaining word.

    Parameters
    ----------
    raw_tweet : str
        The tweet text. Non-string values (for example the NaN that pandas
        uses for an empty spreadsheet cell, or None) are treated as an
        empty tweet.

    Returns
    -------
    list of str
        The lemmatized tokens; empty when nothing is left after cleaning.
    """
    if not isinstance(raw_tweet, str):
        return []

    # Raw strings keep "\w" and "\S" as regex escapes rather than (invalid)
    # Python string escapes.
    without_mentions = re.sub(r"@\w+", "", raw_tweet).strip()
    without_links = re.sub(r"http\S+", "", without_mentions).strip()
    letters_only = re.sub("[^a-zA-Z]", " ", without_links)
    words = letters_only.lower().split()

    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]
    return [lemmatizer.lemmatize(word) for word in meaningful_words]

In [29]:
# Clean every tweet, collecting one token list per tweet (same order as
# `data`) and echoing each list as it is produced.
new_data = []
for raw_tweet in data:
    tokens = tweet_preprocess(raw_tweet)
    new_data.append(tokens)
    print(tokens)

['said']
['plus', 'added', 'commercial', 'experience', 'tacky']
['today', 'must', 'mean', 'need', 'take', 'another', 'trip']
['really', 'aggressive', 'blast', 'obnoxious', 'entertainment', 'guest', 'face', 'amp', 'little', 'recourse']
['really', 'big', 'bad', 'thing']
['seriously', 'would', 'pay', 'flight', 'seat', 'playing', 'really', 'bad', 'thing', 'flying', 'va']
['yes', 'nearly', 'every', 'time', 'fly', 'vx', 'ear', 'worm', 'go', 'away']
['really', 'missed', 'prime', 'opportunity', 'men', 'without', 'hat', 'parody']
['well']
['amazing', 'arrived', 'hour', 'early', 'good']
['know', 'suicide', 'second', 'leading', 'cause', 'death', 'among', 'teen']
['lt', 'pretty', 'graphic', 'much', 'better', 'minimal', 'iconography']
['great', 'deal', 'already', 'thinking', 'nd', 'trip', 'amp', 'even', 'gone', 'st', 'trip', 'yet', 'p']
['flying', 'fabulous', 'seductive', 'sky', 'u', 'take', 'stress', 'away', 'travel']
['thanks']
['sfo', 'pdx', 'schedule', 'still', 'mia']
['excited', 'first', 'cros

In [30]:
# Sanity check: there should be one token list per tweet in `data`.
len(new_data)

1000

## To do: for each tweet, get its sentiment using SentiWordNet
NB: add your name and submit your work under Lab4_submission

In [39]:
def tweet_sentiment(tokens):
    """Sum the SentiWordNet scores of one tokenized tweet.

    Parameters
    ----------
    tokens : list of str
        Cleaned, lemmatized words of a single tweet.

    Returns
    -------
    tuple
        ``(pos, neg, obj)`` — the summed positive, negative and objective
        scores over every token that ``get_sentiment`` could score.
    """
    pos = neg = obj = 0
    for word, tag in nltk.pos_tag(tokens):
        scores = get_sentiment(word, tag)
        # [] means the word could not be scored; skip it explicitly rather
        # than hiding the IndexError behind a bare `except`, which would
        # also swallow unrelated errors and KeyboardInterrupt.
        if not scores:
            continue
        pos += scores[1]
        neg += scores[2]
        obj += scores[3]
    return pos, neg, obj


# Net sentiment (positive minus negative) per tweet, kept in tweet order so
# the results can be reused after this cell instead of only being printed.
tweet_scores = []
for j, tokens in enumerate(new_data):
    pos, neg, obj = tweet_sentiment(tokens)
    tweet_scores.append(pos - neg)
    print("Sentiment of the statement",j," is {0} ".format(pos - neg))

Sentiment of the statement 0  is 0.0 
Sentiment of the statement 1  is 0.0 
Sentiment of the statement 2  is 0.125 
Sentiment of the statement 3  is 0.0 
Sentiment of the statement 4  is 0.125 
Sentiment of the statement 5  is 0.25 
Sentiment of the statement 6  is 0.0 
Sentiment of the statement 7  is 0.625 
Sentiment of the statement 8  is 0.375 
Sentiment of the statement 9  is 0.5 
Sentiment of the statement 10  is 0.125 
Sentiment of the statement 11  is 0.875 
Sentiment of the statement 12  is 0.5 
Sentiment of the statement 13  is 1.0 
Sentiment of the statement 14  is 0.125 
Sentiment of the statement 15  is -0.125 
Sentiment of the statement 16  is -0.25 
Sentiment of the statement 17  is 0.5 
Sentiment of the statement 18  is 0.0 
Sentiment of the statement 19  is 1.0 
Sentiment of the statement 20  is 0.375 
Sentiment of the statement 21  is 0.625 
Sentiment of the statement 22  is 1.25 
Sentiment of the statement 23  is 0.125 
Sentiment of the statement 24  is 0.375 
Sentim