## 1. Setup

### 1.1 Environment/Imports

In [51]:
import pandas as pd
import numpy as np
from datetime import datetime
from nltk import clean_html, SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

### 1.2 Read in data

In [52]:
# Source workbook, addressed relative to the notebook's working directory.
dbase = '../assets/FULL DB VALUES.xlsx'
# `sheet_name` is the read_excel keyword that selects the worksheet to load.
data = pd.read_excel(dbase, sheet_name='Sheet1')

data.head()

Unnamed: 0,Author,Author link,Description,Idea,Idea URL,Long,Performance,Quality,Submission Date,Rating Date,...,One Year Index Price,Two Year Index Price,Industry,PE,Psales,PFCF,EV,MKT,Country,ROIC
0,coda516,/member/coda516/11034,\nDescription\nRule #1 is “Never Lose Money.” ...,1-800-Contacts,https://valueinvestorsclub.com/idea/1-800-Cont...,True,,5.7,2006-11-06,2006-11-20,...,1439.7,806.58,Catalog/Specialty Distribution,,0.813072,,216.0832,191.964398,United States,-6.276761
1,Den1200,/member/Den1200/31058,\nDescription\nI recommend the purchase of BRK...,BERKSHIRE HATHAWAY,https://valueinvestorsclub.com/idea/BERKSHIRE_...,True,3.9,2.6,2016-08-23,2016-09-06,...,2208.73,2208.73,Financial Conglomerates,13.999246,1.632202,20.436016,398776.915475,367130.543924,United States,7.381747
1,piggybanker,/member/piggybanker/31371,\nDescription\n\nWe believe that the stock of ...,1-800-FLOWERS.COM,https://valueinvestorsclub.com/idea/1-800-FLOW...,True,4.3,4.0,2013-10-16,2013-10-30,...,1994.65,2079.36,Internet Retail,25.253501,0.452324,117.987542,401.66476,141.107958,United States,9.552421
2,ahnuld,/member/ahnuld/63009,\nDescription\nI know this idea was posted les...,QHR CORP,https://valueinvestorsclub.com/idea/QHR_CORP/1...,True,,,2016-08-23,2016-09-06,...,2208.73,2208.73,Information Technology Services,497.435897,3.23255,19.339538,146.158744,162.485049,Canada,3.322161
2,shoon1022,/member/shoon1022/23813,"\nDescription\n \nAt $2.80, FLWS has an enterp...",1-800-FLOWERS.COM,https://valueinvestorsclub.com/idea/1-800-FLOW...,True,3.4,3.8,2011-02-20,2011-03-06,...,1364.33,1539.79,Internet Retail,,0.268181,,217.2808,75.961198,United States,-0.393029


## 2. Data cleaning

In [53]:
# Remove the link/identifier columns and the "Two Year" columns, one at a time
# and in place, so a missing column still raises KeyError.
unused_columns = [
    'Author link',
    'Idea',
    'Idea URL',
    'Two Year Date',
    'Two Year Index Price',
    'Two Year Price',
]
for column_name in unused_columns:
    del data[column_name]

In [54]:
# Convert both date columns to pandas datetimes.
for date_column in ('Submission Date', 'One Year Date'):
    data[date_column] = pd.to_datetime(data[date_column])

### 2.1 Text cleaning/pre-processing

In [55]:
"""
Preprocessing text and html (Tokenizing words and sentences, clean HTML, clean text, removing stopwords, stemming and lemmatization)
__author__ : Triskelion user@Kaggle (Thanks: Abhishek Thakur & Foxtrot user@Kaggle)
"""

# Tokenizing (Document to list of sentences. Sentence to list of words.)
def tokenize(str):
    '''Split text into sentences, then each sentence into lowercase word tokens.

    Apostrophes are removed before sentence splitting. Within each sentence
    only runs of word characters are kept (which drops punctuation); they are
    re-joined with single spaces, lowercased and passed to word_tokenize.

    Parameters
    ----------
    str : the text to tokenize. The name shadows the builtin; it is kept so
        that existing keyword callers continue to work.

    Returns
    -------
    list of lists: one inner list of word tokens per sentence.
    '''
    # Only re.UNICODE is used: on Python 3 re.LOCALE is restricted to bytes
    # patterns and raises ValueError when applied to this str pattern.
    word_pattern = re.compile(r'\w+', flags=re.UNICODE)
    sentences = sent_tokenize(str.replace("'", ""))
    return [word_tokenize(" ".join(word_pattern.findall(sentence)).lower())
            for sentence in sentences]

#Removing stopwords. Takes list of words, outputs list of words.
def remove_stopwords(l_words, lang='english'):
    '''Return the words from l_words that are not stopwords for lang.

    Parameters
    ----------
    l_words : iterable of word strings.
    lang : name passed to stopwords.words() to pick the stopword list.

    Returns
    -------
    list: the words, in their original order and casing, whose lowercase
    form is not in the stopword list.
    '''
    # Build a set once so each membership test is a hash lookup instead of a
    # scan over the whole stopword list for every word.
    l_stopwords = set(stopwords.words(lang))
    return [w for w in l_words if w.lower() not in l_stopwords]
        
#Clean HTML / strip tags TODO: remove page boilerplate (find main content), support email, pdf(?)
def html2text(str):
    """Return the result of passing ``str`` through NLTK's ``clean_html``.

    NOTE(review): NLTK 3.x is understood to ship ``clean_html`` as a stub that
    raises NotImplementedError -- confirm against the installed version.
    """
    plain_text = clean_html(str)
    return plain_text
        
#Stem all words with stemmer of type, return encoded as "encoding"
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem or lemmatize every word and encode each result with ``encoding``.

    Parameters
    ----------
    words_l : list of words to normalise.
    type : one of "PorterStemmer", "SnowballStemmer", "LancasterStemmer" or
        "WordNetLemmatizer". Any other value (including False) disables
        stemming.
    lang : language handed to SnowballStemmer; the other options ignore it.
    encoding : codec name passed to ``.encode()`` on every normalised word.

    Returns
    -------
    list: ``words_l`` itself, untouched, when ``type`` is unsupported;
    otherwise a new list holding the encoded result for each word.
    """
    supported_stemmers = ("PorterStemmer", "SnowballStemmer",
                          "LancasterStemmer", "WordNetLemmatizer")
    if type is False or type not in supported_stemmers:
        return words_l

    # Build only the requested normaliser, then apply it in a single pass.
    if type == "PorterStemmer":
        normalise = PorterStemmer().stem
    elif type == "SnowballStemmer":
        normalise = SnowballStemmer(lang).stem
    elif type == "LancasterStemmer":
        normalise = LancasterStemmer().stem
    else:  # "WordNetLemmatizer" -- TODO: context
        normalise = WordNetLemmatizer().lemmatize
    return [normalise(word).encode(encoding) for word in words_l]

#The preprocess pipeline. Returns as lists of tokens or as string. If stemmer_type = False or not supported then no stemming.
def preprocess_pipeline(str, lang="english", stemmer_type="PorterStemmer", return_as_str=False,
                        do_remove_stopwords=False, do_clean_html=False):
    """Run the optional HTML cleaning, tokenizing, stopword and stemming steps.

    Parameters
    ----------
    str : raw text (or HTML when ``do_clean_html`` is true).
    lang : language used for stopword removal and forwarded to ``stemming``.
    stemmer_type : stemmer name understood by ``stemming``; False or an
        unsupported name skips stemming.
    return_as_str : when true, join tokens and sentences with single spaces
        and return one string; otherwise return a list of token lists.
    do_remove_stopwords : when true, filter each sentence with
        ``remove_stopwords``.
    do_clean_html : when true, pass the input through ``html2text`` first.

    Returns
    -------
    list of token lists (one per sentence), or a single string when
    ``return_as_str`` is true.

    NOTE(review): ``stemming`` encodes its output, so on Python 3 the tokens
    are bytes and the ``return_as_str`` join would fail -- confirm the target
    interpreter before relying on that mode with a stemmer enabled.
    """
    text = html2text(str) if do_clean_html else str
    processed = []
    for sentence in tokenize(text):
        words = remove_stopwords(sentence, lang) if do_remove_stopwords else sentence
        # Forward lang so SnowballStemmer uses the requested language rather
        # than always falling back to its "english" default.
        words = stemming(words, stemmer_type, lang)
        processed.append(" ".join(words) if return_as_str else words)
    return " ".join(processed) if return_as_str else processed

In [57]:
# The rows shown by data.head() all start with a "\nDescription" header
# (12 characters). Remove it only when it is actually present, so re-running
# this cell (or a row without the header) does not lose real text.
DESCRIPTION_HEADER = '\nDescription'


def _strip_description_header(text):
    """Return text without the leading header and without trailing whitespace."""
    if text.startswith(DESCRIPTION_HEADER):
        text = text[len(DESCRIPTION_HEADER):]
    return text.rstrip()


data['Description'] = data['Description'].map(_strip_description_header)
# Lemmatized, stopword-free tokens: one list of tokens per sentence.
data['WordNet Desc'] = data['Description'].apply(
    lambda x: preprocess_pipeline(x,
                                  stemmer_type='WordNetLemmatizer',
                                  do_remove_stopwords=True))


### 2.2 Create other features

In [58]:
#add relative market returns (Stock performance - index performance)
stock_growth = data['One Year Price'].div(data['Rating Price'])
index_growth = data['One Year Index Price'].div(data['Rating Index Price'])
data['Year Return'] = stock_growth - 1
data['Year Index Return'] = index_growth - 1
data['Outperformance'] = data['Year Return'] - data['Year Index Return']

#add length of write-up
data['Desc length'] = data['Description'].map(len)

#add logged valuation metrics
def log_data(Series, frame=None):
    """Add a natural-log column named ``'log' + Series`` to a DataFrame in place.

    Parameters
    ----------
    Series : name of the column to transform.
    frame : DataFrame to modify; defaults to the notebook-level ``data``.

    Returns
    -------
    None. The new column holds ``np.log`` of each positive value and NaN for
    zero, negative or missing values.
    """
    target = data if frame is None else frame
    values = target[Series]
    # Mask non-positive entries before taking the log: log(0) is -inf, which
    # fillna does not replace, and the log of a negative number is NaN plus a
    # runtime warning.
    target['log' + Series] = values.where(values > 0).map(np.log)
    
for metric in ('PE', 'Psales', 'PFCF', 'MKT', 'EV', 'ROIC'):
    log_data(metric)

#remove non-priced securities
# Keep only rows with a computed Outperformance, then replace every remaining
# missing value with 0.
# TODO: look for the best way to fill missing data.
data = data[data['Outperformance'].notnull()].fillna(0)

In [59]:
#add market cap categories
# The top edge is open-ended so the bins stay strictly increasing whatever the
# largest MKT value is; using the column maximum fails when it is <= 8000.
# NOTE(review): with pd.cut's defaults the lowest edge is exclusive, so rows
# with MKT == 0 get no category -- confirm that is intended.
bins = [0, 800, 8000, np.inf]
names = ['Small Cap','Medium Cap','Large Cap']
data['MKT_category'] = pd.cut(data['MKT'], bins, labels=names)

In [60]:
#sentiment feature
from textblob import TextBlob

# Analyse each description once and read both scores from the same result,
# instead of building and scoring a TextBlob twice per row.
description_sentiment = data['Description'].map(lambda text: TextBlob(text).sentiment)
data['Sentiment_polarity'] = description_sentiment.map(lambda s: s.polarity)
data['Sentiment_subjectivity'] = description_sentiment.map(lambda s: s.subjectivity)

### 2.3 Create target classifier

In [61]:
#add market return classifier
#0 if underperformed market, 1 if outperformed market
def market_binary(x):
    """Map a relative return to a binary label.

    Returns 1 when ``x`` is greater than zero, 0 when it is zero or negative,
    and NaN when neither comparison holds (for example when ``x`` is NaN).
    """
    if x > 0:
        return 1
    if x <= 0:
        return 0
    return np.nan

# Label every row from its Outperformance value.
data['Outperformed'] = data['Outperformance'].map(market_binary)

In [62]:
# Write the preprocessed frame to an Excel workbook.
data.to_excel(excel_writer='../assets/Preprocessed Data.xlsx')

In July 2012, MGMb filed a private S-1 under the Jobs Act. No word yet on the timing of an IPO, but on 9/13/13, management adopted a poison pill and announced a $75mm buyback. Each quarter the company posts recent financials to its web site and hosts a conference call. Management then removes the prior quarter financials and the replay of the previous call.
Background
Kirk Kerkorian has bought and sold the studio numerous times over the past thirty years. The company filed and then emerged from a pre-packaged bankruptcy in 2010. The 2010 reorganization followed Kerkorian’s 2004 sale of MGMb for $5bn to an LBO consortium backed by Sony, Comcast, TPG and Providence Equity Partners. While the company was in Chapter 11, Carl Icahn made two