In [None]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

In [None]:
# Imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Stopwords
Stopwords are very common words that add little meaning to a text.
Examples: "a", "of", "the".

In [None]:
# Load NLTK's English stopword list. On a fresh environment the corpus is not
# installed and the lookup raises LookupError, so fetch it once and retry.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

### PorterStemmer
Words that share the same stem typically have the same meaning.
PorterStemmer strips the affixes so that only the stem is kept, which reduces the number of distinct words (features).

In [None]:
# Porter stemmer instance; clean_text calls ps.stem() on the tokens it keeps
ps = nltk.PorterStemmer()

# Extract Data

In [None]:
# Read the raw CSV (ISO-8859-1 encoded) and label its columns in one chained step
data_twitter_og = (
    pd.read_csv("../Twitter_Data_for_NLP.csv", encoding="ISO-8859-1")
    .set_axis(["tweet", "sentiment"], axis="columns")
)
data_twitter_og

### Eliminate empty tweets (probably pictures or memes)

In [None]:
# Use pandas' string dtype for the tweet text
data_twitter_og["tweet"] = data_twitter_og["tweet"].astype("string")

# Keep only the rows that actually have tweet text; copy so later column
# assignments do not touch the original frame
data_twitter_cleaned = data_twitter_og.loc[data_twitter_og["tweet"].notna(), :].copy()

# Work on a random subset of at most 10,000 tweets. The size is capped at the
# number of available rows (sample() raises if n exceeds it) and the seed is
# fixed so that Restart & Run All reproduces the same subset.
data_twitter = data_twitter_cleaned.sample(
    n=min(10000, len(data_twitter_cleaned)), random_state=42
)

In [None]:
# Swap the shuffled row labels left over from sampling for a fresh 0..n-1 index,
# discarding the old labels instead of keeping them as an "index" column
data_twitter.reset_index(drop=True, inplace=True)

In [None]:
# Preview the sampled, re-indexed tweets
data_twitter

# Prepare Data

## Feature Creation
1. Punctuation percentage
2. Text Length
3. Capitalization percentage

In [None]:
# Function to count punctuation
def count_punct(text):
    """Return the share of punctuation among the non-space characters, in percent.

    Args:
        text: The string to inspect.

    Returns:
        Percentage (0-100) rounded to one decimal place; 0 when ``text``
        contains no non-space characters.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:   # Need to avoid dividing by 0
        return 0
    punct = sum(1 for char in text if char in string.punctuation)
    # Scale to percent *before* rounding: round(x, 3) * 100 leaks float noise
    # such as 14.299999999999999 into the feature column.
    return round(100 * punct / non_space, 1)

# Store every tweet's punctuation share as a new feature column
data_twitter['punct%'] = data_twitter['tweet'].apply(count_punct)

In [None]:
# Tweet length in characters, not counting spaces, stored as its own column
data_twitter['text_len'] = [len(tweet) - tweet.count(" ") for tweet in data_twitter['tweet']]

In [None]:
# Function to determine capitalization percentage
def capital_percent(text):
    """Return the share of uppercase characters among the non-space characters, in percent.

    Args:
        text: The string to inspect.

    Returns:
        Percentage (0-100) rounded to one decimal place; 0 when ``text``
        contains no non-space characters.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:      # Avoid dividing by 0
        return 0
    capitals = sum(1 for char in text if char.isupper())
    # Scale to percent *before* rounding: round(x, 3) * 100 leaks float noise
    # such as 14.299999999999999 into the feature column.
    return round(100 * capitals / non_space, 1)

# Store every tweet's uppercase share as a new feature column
data_twitter['capital%'] = data_twitter['tweet'].apply(capital_percent)

## Clean Data

In [None]:
# Function to clean up data
# Eliminate punctuation
# Make everything lowercase
def clean_text(text):
    """Turn a tweet into a list of stemmed, lowercase, stopword-free tokens.

    Punctuation characters are removed and the rest is lowercased, then split
    on runs of non-word characters. Tokens found in ``stopwords`` are dropped
    and the remaining ones are stemmed with ``ps``.

    Args:
        text: The raw tweet string.

    Returns:
        List of stemmed tokens (empty when nothing survives the filtering).
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text is empty or starts/ends with a separator;
    # skip those so the empty string never becomes a vocabulary term.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]  # Use portstemmer

## Vectorization

### TF-IDF
Term Frequency-Inverse Document Frequency weighting
* Creates a document-term matrix where each cell contains a weight describing how important that word is to the text
* How much does a word differentiate a tweet from the others? Pulls out important but seldom-used words

In [None]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
tfidf_vect_twitter = TfidfVectorizer(analyzer=clean_text)

# Fit and transform model
# NOTE(review): the vectorizer is fitted on every sampled tweet, i.e. before the
# train/test split further down, so the hold-out rows influence the features.
X_tfidf_twitter = tfidf_vect_twitter.fit_transform(data_twitter['tweet'])

# Create df to see vectorization and concatenated created features.
# The dense TF-IDF frame reuses data_twitter's index so the concat pairs rows
# correctly even if data_twitter does not carry a 0..n-1 index, and its integer
# column labels are converted to strings: a frame mixing int labels with
# 'text_len' / 'punct%' / 'capital%' is rejected by recent scikit-learn at fit time.
X_tfidf_feat_twitter = pd.concat(
    [
        data_twitter[['text_len', 'punct%', 'capital%']],
        pd.DataFrame(X_tfidf_twitter.toarray(), index=data_twitter.index).rename(columns=str),
    ],
    axis=1,
)
X_tfidf_feat_twitter.head(5)

### Count Vectorizer
* Creates a document term matrix where the entry of each cell will be a count of the number of times that word occurred in that document

In [None]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
count_vect_twitter = CountVectorizer(analyzer=clean_text)

# Fit and Transform model
X_count_twitter = count_vect_twitter.fit_transform(data_twitter['tweet'])

# Create df to see vectorization and concatenated created features.
# The dense count frame reuses data_twitter's index so the concat pairs rows
# correctly even if data_twitter does not carry a 0..n-1 index, and its integer
# column labels are converted to strings: a frame mixing int labels with
# 'text_len' / 'punct%' / 'capital%' is rejected by recent scikit-learn at fit time.
X_count_feat_twitter = pd.concat(
    [
        data_twitter[['text_len', 'punct%', 'capital%']],
        pd.DataFrame(X_count_twitter.toarray(), index=data_twitter.index).rename(columns=str),
    ],
    axis=1,
)

X_count_feat_twitter.head()

# Random Forest on Holdout Test Set

In [None]:
# X_tfidf_feat_twitter
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the tweets for evaluation. The fixed seed keeps the split,
# and therefore the reported scores, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_feat_twitter, data_twitter['sentiment'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees of depth <= 20. random_state fixes the randomness
# used while growing the trees so that re-running the cell reproduces the model.
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train, y_train)

In [None]:
# Predict the held-out tweets, then compute precision / recall / F-score / support
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy: fraction of hold-out predictions equal to the true label, to 3 decimals
round((y_pred == y_test).mean(), 3)