# Task 1: Sentiment Labeling
## Import Libraries

In [1]:
!pip install vaderSentiment

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder



## Import dataset and explore

In [2]:
# Read the raw email dataset from the CSV file and preview its first rows.
DATA_PATH = "test(in).csv"
df = pd.read_csv(DATA_PATH)
print(df.head())

                                        Subject  \
0                          EnronOptions Update!   
1                                  (No Subject)   
2  Phone Screen  Interview - Shannon L. Burnham   
3                         RE: My new work email   
4                                           Bet   

                                                body       date  \
0  EnronOptions Announcement\n\n\nWe have updated...  5/10/2010   
1  Marc,\n\nUnfortunately, today is not going to ...  7/29/2010   
2  When: Wednesday, June 06, 2001 10:00 AM-11:00 ...  7/25/2011   
3  we were thinking papasitos (we can meet somewh...  3/25/2010   
4  Since you never gave me the $20 for the last t...  5/21/2011   

                      from  
0     sally.beck@enron.com  
1      eric.bass@enron.com  
2     sally.beck@enron.com  
3  johnny.palmer@enron.com  
4  lydia.delgado@enron.com  


In [3]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

# Summary statistics for every column.
print(df.describe())

# Peek at a few raw email bodies. The fixed seed keeps the sampled rows the
# same on every Restart & Run All.
print(df['body'].sample(5, random_state=42))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  2191 non-null   object
 1   body     2191 non-null   object
 2   date     2191 non-null   object
 3   from     2191 non-null   object
dtypes: object(4)
memory usage: 68.6+ KB
None
             Subject   body      date                     from
count           2191   2191      2191                     2191
unique          1251   1539       690                       10
top     (No Subject)  \n\n   7/1/2011  lydia.delgado@enron.com
freq             141     21         9                      284
1392       When are y'all headin' out of town?\n\nDB\n\n 
1826    You got the right one -- thanks for telling me...
2043    You're almost done signing up for PayPal!\n\nT...
1846           $1 bid for Bjornson\n$0 bid for greg clark
358     Tammie,\nAttached is the Global Risk Managemen...
Name: body, dtype: ob

## Use VADER to assign pseudo-labels and encode text data into numerical features

I chose VADER, a lexicon- and rule-based sentiment analysis tool designed for social media text. It applies thresholds to its compound score to assign 'positive' (score >= 0.05), 'negative' (score <= -0.05), or 'neutral' labels. VADER is well suited to informal or social text such as emails because it handles negations, intensifiers, and slang well.
For encoding, I used TF-IDF to capture the importance of words relative to the corpus. I used frequency encoding for the sender email addresses, which is useful when many unique senders exist and avoids a high-dimensional sparse encoding.

In [4]:
# Text features: TF-IDF over the email body only (the subject is not used).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test-set term statistics leak into the features;
# consider fitting on the training rows only.
body_text = df['body'].astype(str)
tfidf = TfidfVectorizer()
X_text = tfidf.fit_transform(body_text)

# Sender feature: replace each address with the number of emails it sent.
sender_freq = df['from'].value_counts().to_dict()
df['from_freq'] = df['from'].map(sender_freq)

# Shared VADER analyzer used by the sentiment labeling function.
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text, pos_threshold=0.05, neg_threshold=-0.05):
    """Map a piece of text to a sentiment label using VADER's compound score.

    Args:
        text: Input to score; it is converted with ``str()`` before scoring.
        pos_threshold: Compound scores at or above this value are labeled
            'positive'. Defaults to 0.05.
        neg_threshold: Compound scores at or below this value are labeled
            'negative'. Defaults to -0.05.

    Returns:
        'positive', 'negative', or 'neutral' (any score strictly between the
        two thresholds).
    """
    # Relies on the module-level `analyzer` (a SentimentIntensityAnalyzer).
    compound = analyzer.polarity_scores(str(text))['compound']
    if compound >= pos_threshold:
        return 'positive'
    if compound <= neg_threshold:
        return 'negative'
    return 'neutral'

from scipy.sparse import hstack

# Pseudo-label every email body with VADER. This runs once; the original cell
# repeated the identical pass and scored the whole corpus twice.
df['sentiment'] = df['body'].apply(get_sentiment)

# Turn the string labels into integer class ids. The fitted encoder `le` is
# kept so ids can be mapped back to label names with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

# Final feature matrix: the TF-IDF columns followed by one sender-frequency
# column. CSR format is requested so the result supports row indexing.
# NOTE(review): the frequency column is on a much larger scale than the
# TF-IDF weights; consider scaling it before fitting a linear model.
X_combined = hstack([X_text, df[['from_freq']].to_numpy()], format='csr')

## Split data into train and test sets

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

## Train the logistic regression model

In [7]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations.
model = LogisticRegression(
    max_iter=1000,
)
model.fit(X_train, y_train)