Comparing the actual label with the VADER prediction, we get an accuracy score of 0.768

# Imports

In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

In [None]:
# Imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Stopwords
Stopwords are words that are very common and add little meaning
examples: a, of, the

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

### PorterStemmer
Words that have the same stem, typically have the same meaning
PorterStemmer cuts off the affixes so you just use the stem -> reduces word count (features)

In [4]:
ps = nltk.PorterStemmer()

# Read in Data

In [5]:
data = pd.read_csv("financial_news_sentiments_train.csv", encoding = "ISO-8859-1")
data.columns = ['sentiment', 'headlines']
data.head()

Unnamed: 0,sentiment,headlines
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


# Prepare Data

## Feature Creation
1. Punctuation percentage
2. Text Length
3. Captialization percentage

In [6]:
# Function to count punctuation
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

# Apply function to make new column
data['punct%'] = data['headlines'].apply(lambda x: count_punct(x))

In [7]:
# Determine length of headline and make it a column
data['text_len'] = data['headlines'].apply(lambda x: len(x) - x.count(" "))

In [8]:
# Function to determine capitalization percentage
def capital_percent(text):
    count = sum([1 for char in text if char.isupper()])
    return round(count/(len(text) - text.count(" ")), 3)*100

# Apply function to make new column
data['capital%'] = data['headlines'].apply(lambda x: capital_percent(x))

## Clean Data

In [9]:
# Function to clean up data
# Eliminate punctuation
# Make everything lowercase
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]  # Use portstemmer
    return text

## Vectorization

### TF-IDF
Inverse Document Frequency Weighting 
* Creates a document-term matrix where the cells contain a weighting of how important that word is to the text
* How much does a word differentiate a text message from othes? Pulls out important but seldom used words

In [12]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
tfidf_vect = TfidfVectorizer(analyzer=clean_text) 

# Fit and transform model 
X_tfidf = tfidf_vect.fit_transform(data['headlines'])

# Create df to see vectorization and concatenated created features
X_tfidf_feat = pd.concat([data['text_len'], data['punct%'], data['capital%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

### Count Vectorizer
* Creates a document term matrix where the entry of each cell will be a count of the number of times that word occurred in that document

In [13]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
count_vect = CountVectorizer(analyzer=clean_text)

# Fit and Transform model
X_count = count_vect.fit_transform(data['headlines'])

# Create df to see vectorization and concatenated created features
X_count_feat = pd.concat([data['text_len'], data['punct%'], data['capital%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,160,1.9,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,193,1.6,2.1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,174,0.6,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,163,6.7,1.2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,154,1.3,21.4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Parameter Settings

In [15]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,39.483314,1.167703,0.522142,0.087084,,300,"{'max_depth': None, 'n_estimators': 300}",0.69453,0.688338,0.635707,0.652219,0.663571,0.666873,0.022012,1
8,51.573586,2.324569,1.263336,0.273069,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.696594,0.698658,0.629515,0.639835,0.663571,0.665635,0.028368,2
10,34.212505,2.768572,0.906632,0.386668,,150,"{'max_depth': None, 'n_estimators': 150}",0.692466,0.691434,0.635707,0.640867,0.665635,0.665222,0.024058,3
5,42.425284,1.016262,1.271919,0.36479,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.713106,0.683179,0.623323,0.631579,0.653251,0.660888,0.033323,4
7,26.763226,1.563173,1.016106,0.264996,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.702786,0.682147,0.630547,0.626419,0.659443,0.660268,0.029381,5
