Comparing the actual label with the VADER prediction, we get an accuracy score of 0.768

# Imports

In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

In [3]:
# Imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Stopwords
Stopwords are words that are very common and add little meaning
examples: a, of, the

In [4]:
stopwords = nltk.corpus.stopwords.words('english')

### PorterStemmer
Words that have the same stem, typically have the same meaning
PorterStemmer cuts off the affixes so you just use the stem -> reduces word count (features)

In [5]:
ps = nltk.PorterStemmer()

# Read in Data

In [6]:
data_news = pd.read_csv("Financial_News_Data_NLP.csv", encoding = "ISO-8859-1")
data_news.columns = ["sentiment", "headlines"]
data_news.head()

Unnamed: 0,sentiment,headlines
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [7]:
data_twitter_og = pd.read_csv("../Twitter_Data_for_NLP.csv", encoding = "ISO-8859-1")
data_twitter_og.columns = ["tweet", "sentiment"]
data_twitter_og

Unnamed: 0,tweet,sentiment
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


### Eliminate Empty tweets! Probably pictures or memes 

In [8]:
data_twitter_og["tweet"] = data_twitter_og["tweet"].astype("string")
data_twitter = data_twitter_og.loc[pd.notna(data_twitter_og["tweet"]),:].copy()

In [18]:
len(data_twitter)

162976

# Prepare Data

## Feature Creation
1. Punctuation percentage
2. Text Length
3. Captialization percentage

In [19]:
# Function to count punctuation
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    if (len(text) - text.count(" ")) ==0:   # Need to avoid dividing by 0
        return 0
    return round(count/(len(text) - text.count(" ")), 3)*100

# Apply function to make new column
data_news['punct%'] = data_news['headlines'].apply(lambda x: count_punct(x))
data_twitter['punct%'] = data_twitter['tweet'].apply(lambda x: count_punct(x))

In [21]:
len(data_twitter["punct%"])

162976

In [10]:
# Determine length of headline and make it a column
data_news['text_len'] = data_news['headlines'].apply(lambda x: len(x) - x.count(" "))

data_twitter['text_len'] = data_twitter['tweet'].apply(lambda x: len(x) - x.count(" "))

In [11]:
# Function to determine capitalization percentage
def capital_percent(text):
    count = sum([1 for char in text if char.isupper()])
    if (len(text) - text.count(" ")) == 0:      # Avoid dividing by 0
        return 0
    return round(count/(len(text) - text.count(" ")), 3)*100

# Apply function to make new column
data_news['capital%'] = data_news['headlines'].apply(lambda x: capital_percent(x))

data_twitter['capital%'] = data_twitter['tweet'].apply(lambda x: capital_percent(x))

## Clean Data

In [12]:
# Function to clean up data
# Eliminate punctuation
# Make everything lowercase
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]  # Use portstemmer
    return text

## Vectorization

### TF-IDF
Inverse Document Frequency Weighting 
* Creates a document-term matrix where the cells contain a weighting of how important that word is to the text
* How much does a word differentiate a text message from othes? Pulls out important but seldom used words

In [23]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
tfidf_vect_news = TfidfVectorizer(analyzer=clean_text)

# Fit and transform model 
X_tfidf_news = tfidf_vect_news.fit_transform(data_news['headlines'])

# Create df to see vectorization and concatenated created features
X_tfidf_feat_news = pd.concat([data_news['text_len'], data_news['punct%'], data_news['capital%'], pd.DataFrame(X_tfidf_news.toarray())], axis=1)
X_tfidf_feat_news.head(5)

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,160,1.9,0.6,0.039911,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,193,1.6,2.1,0.036051,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,174,0.6,0.6,0.039099,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,163,6.7,1.2,0.038290,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,154,1.3,21.4,0.033612,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4840,119,2.5,12.6,0.034775,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4841,119,6.7,2.5,0.028435,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4842,86,5.8,11.6,0.040842,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4843,183,3.8,7.7,0.030181,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
tfidf_vect_twitter = TfidfVectorizer(analyzer=clean_text) 

# Fit and transform model 
X_tfidf_twitter = tfidf_vect_twitter.fit_transform(data_twitter['tweet'])



In [25]:
X_tfidf_twitter_array = pd.DataFrame(X_tfidf_twitter.toarray())

In [22]:
len(data_twitter["text_len"])

162976

In [16]:
data_twitter["punct%"]

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
162975    0.0
162976    0.0
162977    0.0
162978    0.0
162979    0.0
Name: punct%, Length: 162976, dtype: float64

In [17]:
data_twitter["capital%"]

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
162975    0.0
162976    0.0
162977    0.0
162978    0.0
162979    0.0
Name: capital%, Length: 162976, dtype: float64

In [24]:
X_tfidf_feat_twitter = pd.concat([data_twitter['text_len'], data_twitter['punct%'], data_twitter['capital%']], axis=1)
X_tfidf_feat_twitter.head(5)

Unnamed: 0,text_len,punct%,capital%
0,182,0.0,0.0
1,55,0.0,0.0
2,95,0.0,0.0
3,179,0.0,0.0
4,67,0.0,0.0


In [14]:
# Create df to see vectorization and concatenated created features
X_tfidf_feat_twitter = pd.concat([data_twitter['text_len'], data_twitter['punct%'], data_twitter['capital%'], pd.DataFrame(X_tfidf_twitter.toarray())], axis=1)
X_tfidf_feat_twitter.head(5)

: 

: 

### Count Vectorizer
* Creates a document term matrix where the entry of each cell will be a count of the number of times that word occurred in that document

In [13]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
count_vect_news = CountVectorizer(analyzer=clean_text)

# Fit and Transform model
X_count_news = count_vect.fit_transform(data_news['headlines'])

# Create df to see vectorization and concatenated created features
X_count_feat = pd.concat([data_news['text_len'], data_news['punct%'], data_news['capital%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,160,1.9,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,193,1.6,2.1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,174,0.6,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,163,6.7,1.2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,154,1.3,21.4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Parameter Settings

In [15]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_news_feat, data_news['sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,39.483314,1.167703,0.522142,0.087084,,300,"{'max_depth': None, 'n_estimators': 300}",0.69453,0.688338,0.635707,0.652219,0.663571,0.666873,0.022012,1
8,51.573586,2.324569,1.263336,0.273069,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.696594,0.698658,0.629515,0.639835,0.663571,0.665635,0.028368,2
10,34.212505,2.768572,0.906632,0.386668,,150,"{'max_depth': None, 'n_estimators': 150}",0.692466,0.691434,0.635707,0.640867,0.665635,0.665222,0.024058,3
5,42.425284,1.016262,1.271919,0.36479,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.713106,0.683179,0.623323,0.631579,0.653251,0.660888,0.033323,4
7,26.763226,1.563173,1.016106,0.264996,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.702786,0.682147,0.630547,0.626419,0.659443,0.660268,0.029381,5
