In [13]:
pip install vaderSentiment 


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
#importing data frame 
df_clean = pd.read_csv(r"C:\Users\tonym\Documents\Flatiron\phase_5\capstone\MBTI-ML-Social-Media-\data\df_clean.csv")

In [3]:
df_clean.head()

Unnamed: 0,type,Extrovert,Sensing,Thinking,Judging,posts,cleaned posts
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,and moments sportscenter not top ten p...
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,finding the lack these very alarming s...
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one course which say know...
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed our conversation the other ...
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,you fired that another silly misconception...


In [4]:
analyzer = SentimentIntensityAnalyzer()

nlp_sentiment_score = []

for post in df_clean['cleaned posts']:
    score = analyzer.polarity_scores(post)
    nlp_sentiment_score.append(score)
    

In [6]:
# creating columns for sentiment scores. We can create a compound column and
# then one for postive, negative and neutral scores
df_clean["compound_sentiment"] = [
    score["compound"] for score in nlp_sentiment_score
]
df_clean["positive_sent"] = [score["pos"] for score in nlp_sentiment_score]
df_clean["negative_sent"] = [score["neg"] for score in nlp_sentiment_score]
df_clean["neutral_sent"] = [score["neu"] for score in nlp_sentiment_score]

In [9]:
df_clean.head()


Unnamed: 0,type,Extrovert,Sensing,Thinking,Judging,posts,cleaned posts,compound_sentiment,positive_sent,negative_sent,neutral_sent
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,and moments sportscenter not top ten p...,0.9826,0.178,0.113,0.709
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,finding the lack these very alarming s...,0.9984,0.206,0.106,0.688
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one course which say know...,0.9985,0.224,0.098,0.678
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed our conversation the other ...,0.9985,0.176,0.067,0.757
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,you fired that another silly misconception...,0.9917,0.207,0.163,0.63


## POS TAGGING

A Part-Of-Speech Tagger (POS Tagger) is a piece of software that reads text in some language and assigns parts of speech to each word (and other token), such as noun, verb, adjective, etc., although generally computational applications use more fine-grained POS tags like 'noun-plural'. We can find the average of different parts of speech and add them to the clean data set. 

In [16]:
df_clean['tag_posts'] = df_clean['posts'].str.replace(re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*"),
                                                      lambda match: match.group(2),
                                                     )
#replacing ||| with space 
df_clean['tag_posts'] = [post for post in df_clean["tag_posts"].str.split("\|\|\|")]

In [19]:
# pos tagging for each word 
df_clean["tagged_words"] = df_clean["tag_posts"].apply(
    lambda x: [nltk.pos_tag(word_tokenize(line)) for line in x]
)

In [20]:
# create list of point of speech tags 
tag_set = set()

for i, data in df_clean["tagged_words"].iteritems():
    for tup in data[0]:
        tag_set.add(tup[1])

tag_list = list(tag_set)

In [21]:
# calculating average and standard deviation of point of speech tags for each user
def pos_cat(x, tag):
    return [len([y for y in line if y[1] == tag]) for line in x]


for col in tag_list:
    df_clean["POS_" + col + "_mean"] = df_clean["tagged_words"].apply(
        lambda x: np.mean(pos_cat(x, col))
    )
    df_clean["POS_" + col + "_std"] = df_clean["tagged_words"].apply(
        lambda x: np.std(pos_cat(x, col))
    )

The tags that I will be using are from stanford's nlp list. It's based on the penn Treebank POS tagset We can add them to our data frame. The definitions of the tagging set look like this: 
1.	CC	Coordinating conjunction
2.	CD	Cardinal number
3.	DT	Determiner
4.	EX	Existential there
5.	FW	Foreign word
6.	IN	Preposition or subordinating conjunction
7.	JJ	Adjective
8.	JJR	Adjective, comparative
9.	JJS	Adjective, superlative
10.	LS	List item marker
11.	MD	Modal
12.	NN	Noun, singular or mass
13.	NNS	Noun, plural
14.	NNP	Proper noun, singular
15.	NNPS	Proper noun, plural
16.	PDT	Predeterminer
17.	POS	Possessive ending
18.	PRP	Personal pronoun
19.	PRP$	Possessive pronoun
20.	RB	Adverb
21.	RBR	Adverb, comparative
22.	RBS	Adverb, superlative
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form

To read more about the Penn Tree Project:
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf

In [25]:
tags_dict = {
    "ADJ": ["JJ", "JJR", "JJS"],
    "ADP": ["EX", "TO"],
    "ADV": ["RB", "RBR", "RBS", "WRB"],
    "CONJ": ["CC", "IN"],
    "DET": ["DT", "PDT", "WDT"],
    "NOUN": ["NN", "NNS", "NNP", "NNPS"],
    "NUM": ["CD"],
    "PRT": ["RP"],
    "PRON": ["PRP", "PRP$", "WP", "WP$"],
    "VERB": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
    ".": ["#", "$", "''", "(", ")", ",", ".", ":"],
    "X": ["FW", "LS", "UH"],
}

In [23]:
def stanford_tag(x, tag):
    tags_list = [len([y for y in line if y[1] in tags_dict[col]]) for line in x]
    return tags_list


for col in tags_dict.keys():
    df_clean[col + "_avg"] = df_clean["tagged_words"].apply(
        lambda x: np.median(stanford_tag(x, col))
    )

In [24]:
df_clean.head()

Unnamed: 0,type,Extrovert,Sensing,Thinking,Judging,posts,cleaned posts,compound_sentiment,positive_sent,negative_sent,...,ADV_avg,CONJ_avg,DET_avg,NOUN_avg,NUM_avg,PRT_avg,PRON_avg,VERB_avg,._avg,X_avg
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,and moments sportscenter not top ten p...,0.9826,0.178,0.113,...,1.0,2.0,1.0,4.0,0.0,0.0,1.0,2.0,2.5,0.0
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,finding the lack these very alarming s...,0.9984,0.206,0.106,...,2.0,4.0,2.0,6.0,0.0,0.0,4.0,6.0,4.0,0.0
2,INTP,0,0,1,0,'Good one _____ https://www.youtube.com/wat...,good one course which say know...,0.9985,0.224,0.098,...,2.0,1.0,1.0,4.0,0.0,0.0,2.0,4.0,3.0,0.0
3,INTJ,0,0,1,1,"'Dear INTP, I enjoyed our conversation the o...",dear enjoyed our conversation the other ...,0.9985,0.176,0.067,...,1.5,3.0,2.0,5.0,0.0,0.0,3.0,5.0,3.0,0.0
4,ENTJ,1,0,1,1,'You're fired.|||That's another silly misconce...,you fired that another silly misconception...,0.9917,0.207,0.163,...,2.0,2.0,2.0,5.0,0.0,0.0,3.0,5.0,3.0,0.0


In [26]:
# save updated csv 
df_clean.to_csv(r"C:\Users\tonym\Documents\Flatiron\phase_5\capstone\MBTI-ML-Social-Media-\data\df_clean2.csv", index = False)