# Imports

In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

### Stopwords
Stopwords are very common words that add little meaning to a text.
Examples: a, of, the

In [3]:
# English stopword list from the NLTK corpus; clean_text filters tokens against it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus available locally — confirm.
stopwords = nltk.corpus.stopwords.words('english')

### PorterStemmer
Words that share the same stem typically have the same meaning.
PorterStemmer strips the affixes so that only the stem is used, which reduces the number of distinct words (features).

In [4]:
# Single shared stemmer instance; clean_text calls ps.stem() on every token.
ps = nltk.PorterStemmer()

# Read in Data

In [5]:
# Load the training set, decoding the file as ISO-8859-1 (Latin-1).
data = pd.read_csv("financial_news_sentiments_train.csv", encoding = "ISO-8859-1")
# Give the two columns descriptive names.
# NOTE(review): read_csv treats the first row as a header by default — confirm
# the CSV really has a header row, otherwise the first record is dropped here.
data.columns = ['sentiment', 'headlines']
# Preview the first rows.
data.head()

Unnamed: 0,sentiment,headlines
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


# Prepare Data

## Feature Creation
1. Punctuation percentage
2. Text Length
3. Capitalization percentage

In [6]:
# Function to compute the punctuation percentage of a text
def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Args:
        text: The string to inspect.

    Returns:
        Percentage in the range 0-100, rounded to one decimal place.
        Returns 0.0 when ``text`` contains no non-space characters.
    """
    non_space = len(text) - text.count(" ")
    # Empty or all-space text has nothing to measure; avoid ZeroDivisionError.
    if non_space == 0:
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage before rounding so the result carries no float
    # artefacts (round(1/3, 3) * 100 would give 33.300000000000004).
    return round(100 * punct / non_space, 1)

# New column: punctuation percentage of every headline
data['punct%'] = data['headlines'].apply(count_punct)

In [7]:
# Length of each headline measured in non-space characters
# (spaces are subtracted from the raw length) stored as a new column
data['text_len'] = data['headlines'].apply(lambda x: len(x) - x.count(" "))

In [8]:
# Function to determine capitalization percentage
def capital_percent(text):
    """Return the percentage of non-space characters that are uppercase.

    Args:
        text: The string to inspect.

    Returns:
        Percentage in the range 0-100, rounded to one decimal place.
        Returns 0.0 when ``text`` contains no non-space characters.
    """
    non_space = len(text) - text.count(" ")
    # Empty or all-space text has nothing to measure; avoid ZeroDivisionError.
    if non_space == 0:
        return 0.0
    upper = sum(1 for char in text if char.isupper())
    # Scale to a percentage before rounding so the result carries no float
    # artefacts (round(2/3, 3) * 100 would give 66.7 only approximately).
    return round(100 * upper / non_space, 1)

# New column: capitalization percentage of every headline
data['capital%'] = data['headlines'].apply(capital_percent)

## Clean Data

In [None]:
# Function to clean up data:
#   1. remove punctuation and make everything lowercase
#   2. split the text into tokens
#   3. drop stopwords and stem the remaining tokens
def clean_text(text):
    """Normalise a text into a list of stemmed, stopword-free tokens.

    Args:
        text: Raw text string.

    Returns:
        List of tokens stemmed with ``ps.stem``, in their original order,
        excluding empty strings and any token found in the module-level
        ``stopwords``.
    """
    # Iterating a string yields characters, so punctuation is filtered
    # character by character while lowercasing.
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' for leading/trailing separators and for empty input;
    # skip those so no empty token ends up in the result.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

## Vectorization