# Feature Extraction for Emotion Analysis
### 1. Bag of Words

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
d = {'Text': ['I am just so bitter today', 'yuck!So creepy'],
     'Emotion': ['anger', 'disgust']}
sample_data = pd.DataFrame(data=d)
sample_data

Unnamed: 0,Text,Emotion
0,I am just so bitter today,anger
1,yuck!So creepy,disgust


**Unigram**

In [3]:
count_vect = CountVectorizer()
count_vect.fit(sample_data['Text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [4]:
print("Feature names ", count_vect.get_feature_names())

Feature names  ['am', 'bitter', 'creepy', 'just', 'so', 'today', 'yuck']


In [5]:
print(count_vect.fit_transform(sample_data['Text']).toarray()) 

[[1 1 0 1 1 1 0]
 [0 0 1 0 1 0 1]]


**Unigram & Bigram**

In [6]:
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(sample_data['Text'])
print("Feature names ", count_vect.get_feature_names())

Feature names  ['am', 'am just', 'bitter', 'bitter today', 'creepy', 'just', 'just so', 'so', 'so bitter', 'so creepy', 'today', 'yuck', 'yuck so']


### 2. TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tf_idf_vect = TfidfVectorizer()
TFIDF_data = tf_idf_vect.fit_transform(sample_data['Text'])
print("Feature names ", tf_idf_vect.get_feature_names())

Feature names  ['am', 'bitter', 'creepy', 'just', 'so', 'today', 'yuck']


In [9]:
print(TFIDF_data.toarray())

[[0.47107781 0.47107781 0.         0.47107781 0.33517574 0.47107781
  0.        ]
 [0.         0.         0.6316672  0.         0.44943642 0.
  0.6316672 ]]


### 3. Bag of Words with SVD 

**Sample Data**

In [10]:
d = {'Text': 
     ['I am just so bitter today', 
      'yuck! So creepy',
      "that's what I'm afraid of!",
      "Oh! You planned a surprise party for me!",
      "When we give cheerfully and accept gratefully, everyone is blessed",
      "The three R's depress me.",
      "I don't talk about politics because people nowadays get offended easily!",
      "No, We have stupid, dismal, lame winters where we maybe get some dangerous ice once.",
      "Wow! What a beautiful wetehr today!",
      "I'm getting so nervous for my first anatomy exam",
      "Eww! Why are spitting around?",
      "easy, breezy, beautiful way"
     ],
     'Emotion': 
     ['anger', 
      'disgust',
      'fear',
      'surprise',
      'happy',
      'sadness',
      'anger',
      'sadness',
      'surprise',
      'fear',
      'disgust',
      'happy'
     ]}
sample_data = pd.DataFrame(data=d)
sample_data

Unnamed: 0,Text,Emotion
0,I am just so bitter today,anger
1,yuck! So creepy,disgust
2,that's what I'm afraid of!,fear
3,Oh! You planned a surprise party for me!,surprise
4,"When we give cheerfully and accept gratefully,...",happy
5,The three R's depress me.,sadness
6,I don't talk about politics because people now...,anger
7,"No, We have stupid, dismal, lame winters where...",sadness
8,Wow! What a beautiful wetehr today!,surprise
9,I'm getting so nervous for my first anatomy exam,fear


**Bag of Words Counts**

In [11]:
count_vect = CountVectorizer()
count_vect.fit(sample_data['Text'])
bow_counts = count_vect.fit_transform(sample_data['Text'])

**SVD**

In [12]:
from sklearn import decomposition

In [13]:
t_svd= decomposition.TruncatedSVD()
t_svd.n_components = 2
svd_data = t_svd.fit_transform(bow_counts.toarray())

In [14]:
print("shape of train data = ", bow_counts.shape)
print("shape of truncated svd = ", svd_data.shape)

shape of train data =  (12, 70)
shape of truncated svd =  (12, 2)


In [15]:
print(svd_data)

[[ 5.46644039e-18  1.92392274e-15]
 [ 1.01739884e-15  9.33343749e-16]
 [-2.09512199e-16 -7.75860491e-16]
 [-1.47968094e-16  1.98593287e-15]
 [ 1.05391628e+00 -1.41421356e+00]
 [-8.31110955e-16  6.89392226e-16]
 [ 5.26958140e-01  2.82842712e+00]
 [ 4.03297502e+00 -2.53176598e-15]
 [ 5.21251528e-16  8.34277553e-16]
 [-4.86264367e-16  5.92103387e-15]
 [ 5.93909498e-17  1.20366455e-15]
 [ 3.68392992e-16  1.53436129e-16]]


# Feature Classification for Emotion Analysis
### 1. Bag of Words with Logistic Regression
**Bag of Words**

In [18]:
count_vect = CountVectorizer()
count_vect.fit(sample_data['Text'])
final_counts = count_vect.fit_transform(sample_data['Text'])

**Logistic Regression**

In [19]:
from sklearn.linear_model import LogisticRegression

In [21]:
model_lg = LogisticRegression()
model_lg.fit(final_counts, sample_data['Emotion'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
Y = count_vect.transform(["you are sick"])
print(model_lg.predict(Y))

['disgust']


In [23]:
Y = count_vect.transform(["no one cares"])
print(model_lg.predict(Y))

['sadness']


### 2. TF-IDF with Random Forest
**TFIDF**

In [24]:
tf_idf_vect = TfidfVectorizer()
TFIDF_data = tf_idf_vect.fit_transform(sample_data['Text'])

**Random Forest**

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf_clf = RandomForestClassifier()
rf_clf.fit(TFIDF_data, sample_data['Emotion'])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
Y = tf_idf_vect.transform(["i am afraid of change"])
print(rf_clf.predict(Y))

['fear']


In [29]:
Y = tf_idf_vect.transform(["wow beautiful flowers"])
print(rf_clf.predict(Y))

['happy']


### 3. Bag of Words with Neural Network
[smartboost - Feature Classification for Emotion Analysis - Neural Network.ipynb](https://github.com/yingjie-dev/smartboost-opinion-mining/blob/master/smartboost%20-%20Feature%20Classification%20for%20Emotion%20Analysis%20-%20Neural%20Network.ipynb)