In [2]:
import urllib.request
import pandas as pd
import requests
from afinn import Afinn
import glob
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

### Sentiment Analysis

- Goal is to analyze the opinion/"sentiment" of a group, e.g., analyzing opinions of news reports, movies, tweets, instagram posts, etc.


- Often applied to social media postings and consumer reviews of products and services.

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/sunpark92/DAV-5400/master/crawling_huge_influencers.csv",encoding='latin1')

mid_influencer_path = https://raw.githubusercontent.com/sunpark92/DAV-5400/master/crawling_mid_influencers.csv \
micro_influencer_path = https://raw.githubusercontent.com/sunpark92/DAV-5400/master/crawling_micro_influencers.csv

In [4]:
import re

# parsing en only
def text_cleaning(text):
    en = re.compile('[^a-zA-Z]+')
    result = en.sub(' ', str(text))
    return result

df['en_text'] = df['caption'].apply(lambda x: text_cleaning(x))
del df['caption']
df[['profile_name','en_text','tags']]

Unnamed: 0,profile_name,en_text,tags
0,kendalljenner,The Row head to toe,
1,kendalljenner,Barb voted your turn,
2,kendalljenner,,
3,kendalljenner,don t call me babe happy halloween GO VOTE me...,
4,kendalljenner,jelly got me,
...,...,...,...
8270,amypurdygurl,The Fruits of Covid part As challenging as Cov...,
8271,amypurdygurl,I m getting stronger guys I can feel it My foc...,#teamstanley for supporting on the prep & rebu...
8272,amypurdygurl,When your dad saves your life The only way I k...,#kidneytransplant #organdonation #grat
8273,amypurdygurl,Celebrating years of my kidney transplant LIVE...,


### Tokenization

In [5]:
tokens=[]
for i in range(len(df)):
    sentence = df['en_text'][i] # input reviews into a test sentence to be tokenized
    token = nltk.word_tokenize(sentence) # apply the tokenizer
    tokens.append(token)
    
# view the tokens created by the tokenizer
print(tokens)



### Removing Stopwords from Text

In [6]:
# load the stopwords module
from nltk.corpus import stopwords

# now let's look at the English language stopwords
en_stops = set(stopwords.words('english'))
print(en_stops)

# how many English language stopwords are defined in NLTK?
print(len(en_stops))

{"you'll", 'are', 'most', "wouldn't", 'under', "you've", 'until', 'they', 'there', 'am', 's', 'wouldn', 'hasn', 'can', 'when', 'out', 'a', 'yourself', 'ourselves', 'having', 'after', 'very', "hasn't", 'was', "mustn't", 'will', "couldn't", 'my', 'is', 'through', 'herself', 'all', 'not', 'over', 'i', 'y', 'yourselves', 'same', 'own', 'don', 'mightn', 'shouldn', "she's", 'be', 'those', 'with', 'against', 'on', 'an', "hadn't", 'shan', 'whom', 'he', 'both', 'needn', 'him', 'during', 'by', 'then', 'didn', 'aren', "that'll", 'its', 'than', "doesn't", 'wasn', 'mustn', "shouldn't", 'it', 'themselves', 'theirs', 'does', 'but', 'our', 'and', 'where', 'what', 'off', 'ours', 'ma', 'more', 'isn', 'did', 'just', 'should', 'them', 'down', 'about', 'if', 'm', 'me', 'her', "needn't", 'doesn', "mightn't", 't', "haven't", 'hers', 'were', "aren't", 'itself', 'only', 'll', 'that', 'ain', 'nor', "shan't", 'hadn', 'some', 'himself', 'before', 'had', 'yours', "don't", 'no', "you're", 'these', 'we', 'which', 'h

In [7]:
# Remove Stopwords from a tokens and then put results into word_list
word_list=[]
word_list2 = []
word_list3 = []

for i in range(len(tokens)):
    word_list.append([word for word in tokens[i] if word not in en_stops])

for x in word_list:
    for j in x:
        word_list2.append(j.lower())
    
for i in word_list2:
    if i not in en_stops:
        word_list3.append(i)
        
print(word_list3[10:20])

['go', 'vote', 'pamela', 'anderson', 'barb', 'wire', 'angel', 'amberasaly', 'jelly', 'got']


In [8]:
print(len(word_list3))

146868


### NRC

In [10]:
# load NRC file 
NRC=pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',header=None,sep='\t')
NRC # there are only 0 and 1 scores # 141820 rows

Unnamed: 0,0,1,2
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141815,zoom,negative,0
141816,zoom,positive,0
141817,zoom,sadness,0
141818,zoom,surprise,0


In [16]:
emotion=[]
for i in word_list3:
    temp=list(NRC.iloc[np.where(NRC[0]==i)[0],1])
    for j in temp:
        emotion.append(j)

In [17]:
# extract score 1 only 
# 141,820 > 13,901 rows
NRC=NRC[(NRC !=0).all(1)]
NRC=NRC.reset_index(drop=True)
NRC

Unnamed: 0,0,1,2
0,abacus,trust,1
1,abandon,fear,1
2,abandon,negative,1
3,abandon,sadness,1
4,abandoned,anger,1
...,...,...,...
13896,zest,anticipation,1
13897,zest,joy,1
13898,zest,positive,1
13899,zest,trust,1


In [18]:
sentiment_result=pd.Series(emotion).value_counts()
sentiment_result

positive        10226
joy              6300
anticipation     5574
trust            5457
negative         3613
surprise         2517
sadness          2097
fear             1992
anger            1557
disgust          1037
dtype: int64

### = influencers seems like using more positive(positive, joy, anticipation, trust, etc) words compared to using negatvie (nagative, sadness, fear, etc) words

### Afinn

In [19]:
afinn = Afinn(language='en')
afinn_score = []
for i in range(len(df)):
    afinn_score.append(afinn.score(df['en_text'][i]))
afinn_score

[0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 5.0,
 2.0,
 -1.0,
 2.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 4.0,
 0.0,
 2.0,
 0.0,
 0.0,
 0.0,
 9.0,
 0.0,
 0.0,
 -2.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 8.0,
 3.0,
 0.0,
 -1.0,
 2.0,
 6.0,
 0.0,
 1.0,
 6.0,
 9.0,
 0.0,
 17.0,
 8.0,
 10.0,
 7.0,
 6.0,
 6.0,
 6.0,
 11.0,
 9.0,
 0.0,
 0.0,
 -1.0,
 8.0,
 3.0,
 3.0,
 0.0,
 6.0,
 0.0,
 0.0,
 9.0,
 0.0,
 0.0,
 2.0,
 11.0,
 5.0,
 7.0,
 -1.0,
 3.0,
 1.0,
 4.0,
 35.0,
 2.0,
 1.0,
 21.0,
 9.0,
 1.0,
 3.0,
 0.0,
 10.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 12.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -2.0,
 0.0,
 -2.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 10.0,
 3.0,
 0.0,
 1.0,
 1.0,
 15.0,
 0.0,
 0.0,
 8.0,
 5.0,
 3.0,
 2.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 -1.0,
 -3.0,
 0.0,
 2.0,
 0.0,
 4.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 6.0,
 14.0,
 2.0,
 5.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.

In [21]:
df['afinn_score'] = afinn_score
# make data as binary data
df['y'] = df['afinn_score'].apply(lambda x: 1 if int(x)>=0.0 else 0)
df = df[['en_text','afinn_score','y']]
df

Unnamed: 0,en_text,afinn_score,y
0,The Row head to toe,0.0,1
1,Barb voted your turn,0.0,1
2,,0.0,1
3,don t call me babe happy halloween GO VOTE me...,3.0,1
4,jelly got me,0.0,1
...,...,...,...
8270,The Fruits of Covid part As challenging as Cov...,0.0,1
8271,I m getting stronger guys I can feel it My foc...,15.0,1
8272,When your dad saves your life The only way I k...,13.0,1
8273,Celebrating years of my kidney transplant LIVE...,11.0,1


### Create corpus index -> Convert words as X data

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
index_vectorizer = CountVectorizer(tokenizer = lambda x: [x for x in word_list3], stop_words = 'english')
X = index_vectorizer.fit_transform(df['en_text'])
print(X.shape)

  'stop_words.' % sorted(inconsistent))


In [None]:
index_vectorizer.vocabulary_ # kein: 4398th index

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_vectorizer = TfidfTransformer()
X = tfidf_vectorizer.fit_transform(X)
print(X.shape)
print(df['en_text'][0])
print(X[0])

### Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split

y = df['y']
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
print(x_train.shape)
print(x_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
lr = LogisticRegression(random_state = 0)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# evaluate lr model 
print('Accuracy: %.2f'%accuracy_score(y_test, y_pred))
print('Precision: %.3f'%precision_score(y_test, y_pred))
print('Recall: %.2f'%recall_score(y_test, y_pred))
print('F1: %.2f'%f1_score(y_test, y_pred))

In [None]:
# get confusion matrix
from sklearn.metrics import confusion_matrix 

confmat = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confmat)

In [None]:
df['y'].value_counts()

In [None]:
# random sampling 1:1 ratio
positive_random_idx = df[df['y']==1].sample(100, random_state = 30).index.tolist()
negative_random_idx = df[df['y']==0].sample(100, random_state = 30).index.tolist()

# divide dataset as random dataset
random_idx = positive_random_idx + negative_random_idx
sample_X = X[random_idx]
y = df['y'][random_idx]
x_train, x_test, y_train, y_test = train_test_split(sample_X,y,test_size=0.3)
print(x_train.shape)
print(x_test.shape)

In [None]:
lr = LogisticRegression(random_state = 0)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# evaluate lr model again
print('Accuracy: %.2f'%accuracy_score(y_test, y_pred))
print('Precision: %.3f'%precision_score(y_test, y_pred))
print('Recall: %.2f'%recall_score(y_test, y_pred))
print('F1: %.2f'%f1_score(y_test, y_pred))

In [None]:
# get get confusion matrix again
confmat = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confmat)

### Analysis important keyword

In [None]:
import matplotlib.pyplot as plt

In [None]:
# sort regression model coef in decending order
coef_pos_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse = True)

invert_index_vectorizer = {v:k for k,v in index_vectorizer.vocabulary_.items()}

In [None]:
print(str(invert_index_vectorizer)[:100])

In [None]:
#plt.rcParams['figure.figsize'] = [10,8]
#plt.bar(range(len(lr.coef_[0])), lr.coef_[0])

In [None]:
for coef in coef_pos_index[:20]:
    print(invert_index_vectorizer[coef[1]],coef[0])