## Create connection with Mongo

In [None]:
import pandas as pd
import numpy as np
import pymongo
from pymongo import MongoClient
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')

In [None]:
# Open a connection to the MongoDB server at localhost:27017
client = MongoClient('localhost', 27017)

### `client` is the server connection; `db` is the database used by the rest of the notebook
db = client['StockTwitClass101']

## Pipeline To create Sentiment

Related paper: [Intraday online investor sentiment and return patterns in the U.S.
stock market](https://docs.google.com/file/d/1L8bS8vNTXS-HWToP4zMpqfT308n-LxZb/edit)

L1/L2 Lexicon: [here](http://www.thomas-renault.com/data.php)

A) Create a Function to prepare the data
    
    1. Keep only twits whose sentiment is either `Bullish` or `Bearish` and remove twits that mention multiple stocks
   
    2. Take negation into account: we add the prefix "negtag_" to all words following "not", "no", "none", "neither", "never" or "nobody"
    
    3. Convert digit to "_digit"
    
    4. Remove when mention a user
    
    5. lemmatize corpus
    
    6. Prepare train/test set
    
B) Build the Vectorization
C) Construct the Naive classifier
D) Predict out of sample

### note about Lemmatization

Lemmatization reduces words to their base form, the linguistically correct lemma. It finds this base form using a vocabulary and morphological analysis. Lemmatization is usually more sophisticated than stemming: a stemmer works on an individual word without knowledge of the context. For example, the word "better" has "good" as its lemma. Stemming misses this link because finding it requires a dictionary look-up.


### Create a Function to prepare the data

Step : 1
       
       - Exclude multi tickers

Step : 2
       
       - take negation into account:
       
       - "not","no","none","neither","never" or “nobody”

Step : 3
       
       - Convert digit to "_digit"

Step : 4
        
       - Remove @USER

Step : 5
       
       - Remove unicode issue
        
Step 6: Lemmatize


In [None]:
def metatransformation(query, to_train=True):
    """
    Load messages from MongoDB and clean their text for the classifier.

    Arguments:
    query: MongoDB filter passed to db.messages.find
    to_train:  True: return a train and test dataset
    False: return only data to predict out of sample

    Returns:
    to_train=True: X_train, X_test, y_train, y_test, where X is the cleaned
        text ("body_transform") and y the "sentiment_" column, split 90/10
        with random_state=0
    to_train=False: the cleaned text only

    Raises:
    ValueError: the query matches no message

    Step : 1
        - Exclude multi tickers (keep rows whose "symbols" has length 1)

    Step : 2
        - Lower-case the body first, so that the case-sensitive patterns
          below also catch "No", "Never", "BTC", "Ya", ...

    Step : 3
        - take negation into account: append "_negword" to
        - "nothing","no","none","neither","never" or "nobody"
        - NOTE(review): the notebook text lists "not" where this pattern
          list has "nothing" -- confirm which one is intended

    Step : 4
        - Convert digit to "isDigit" (lower-cased to "isdigit" afterwards)
        - Replace "@" or "?" followed by a word with "user"
        - Remove unicode issue, tickers, special and single characters
        - Remove "ya", "bitcoin", "btc"

    Step : 5
        - Remove English stop words

    Step 6: Lemmatize

    Also prints the shape of the filtered data and the number of rows per
    "sentiment" value.
    """

    text = pd.DataFrame(list(db.messages.find(query)))

    if text.empty:
        # Fail early with a readable message instead of a KeyError on "symbols"
        raise ValueError("The query returned no message: {}".format(query))

    # Count stock

    text["count_stock"] = text["symbols"].apply(len)

    # Extract single count; .copy() so that the column assignments below
    # write to an independent frame and not to a slice of the original

    text = text[text["count_stock"] == 1].copy()

    # Lower first: every pattern below is written in lower case

    body = text["body"].str.lower()

    # take negation into account
    text["body_transform"] = body.replace(
        regex={
            r"\bnothing\b": "nothing_negword",
            r"\bno\b": "no_negword",
            r"\bnone\b": "none_negword",
            r"\bneither\b": "neither_negword",
            r"\bnever\b": "never_negword",
            r"\bnobody\b": "nobody_negword",
        }
    )

    # Convert digit to "_digit"
    # Remove @USER
    # Remove unicode issue
    # Remove ticker
    # Remove all the special characters
    # remove all single characters
    # Remove Ya
    # Remove bitcoin
    # remove btc

    text["body_transform"] = text["body_transform"].replace(
        regex={
            r"\d+": "isDigit",
            r"([@?])(\w+)\b": "user",
            r"\b&#\b": " ",
            r"[$][A-Za-z][\S]*": "",
            r"\W": " ",
            r"\s+[a-zA-Z]\s+": " ",
            r"\^[a-zA-Z]\s+": " ",
            r"\s+": " ",
            r"^b\s+": "",
            r"\bya\b": "",
            r"\bbitcoin\b": "",
            r"\bbtc\b": "",
        }
    )

    # Lower again so the inserted "isDigit" placeholder ends up as "isdigit"

    text["body_transform"] = text["body_transform"].str.lower()

    # Remove stop words (a set gives constant-time membership tests)

    stop = set(stopwords.words('english'))

    text["body_transform"] = text["body_transform"].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))

    # Lemmatize

    lemmatizer = WordNetLemmatizer()
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

    text["body_transform"] = text["body_transform"].apply(
        lambda x: " ".join(lemmatizer.lemmatize(w)
                           for w in w_tokenizer.tokenize(x))
    )

    count_ = text.groupby("sentiment")["sentiment"].count()

    print("The shape of the data is {}, and {}".format(text.shape, count_))

    X_ = text["body_transform"]

    if not to_train:
        # Out-of-sample prediction only needs the text, not the labels
        return X_

    # Split the dataset

    y_ = text["sentiment_"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_, y_, test_size=0.1, random_state=0
    )

    return X_train, X_test, y_train, y_test


## Pipeline step

This step includes:

- Build the Vectorization
- Construct the Naive classifier

Example of stop words

In [None]:
# Show the first ten entries of NLTK's English stop-word list
stopwords.words('english')[:10]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Token counts -> tf-idf weighting -> multinomial Naive Bayes
text_clf = Pipeline(
    steps=[
        (
            'vect',
            CountVectorizer(max_df=0.7, min_df=10, max_features=1500),
        ),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)


Create the first transformation of the data

In [None]:
# Select every message whose `sentiment` is not "Neutral"
query = {"sentiment": {"$ne": "Neutral"}}

# Default to_train=True: returns the train/test split
X_train, X_test, y_train, y_test = metatransformation(query=query)

Quick descriptive statistics

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

In [None]:
# One list of tokens per training message
Word_tokenize = X_train.apply(word_tokenize)

# Flatten the per-message lists into a single list of tokens
flattened_list = [
    token
    for tokens in Word_tokenize.tolist()
    for token in tokens
]

# Frequency of each token; plot the 30 most frequent ones
fdist = FreqDist(flattened_list)
fdist.plot(30, cumulative=False)
plt.show()

In [None]:
# Number of training rows per `sentiment_` value
y_train.reset_index().groupby('sentiment_')['sentiment_'].count()

In [None]:
def plot_keyword_sentiment(df, nbKeyword= 10):
    """
    Plot the distribution of sentiments by keyword

    Arguments:
    df: mapping term -> frequency (a dict-like, e.g. the FreqDist built
        in this notebook)
    nbKeyword: number of most frequent terms to plot

    For each of the nbKeyword most frequent terms, counts the rows of the
    notebook-level X_train that contain the term, grouped by the matching
    value in the notebook-level y_train, and draws one stacked horizontal
    bar per term. Returns nothing; does nothing when there is no term.
    """

    df_fdist = pd.DataFrame.from_dict(df, orient='index')
    df_fdist.columns = ['Frequency']
    df_fdist.index.name = 'Term'
    df_fdist = df_fdist.sort_values(by='Frequency', ascending=False)

    # Collect one small frame per keyword and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    for key in df_fdist.head(nbKeyword).index:

        count_sentiment = (
            pd.concat([X_train[X_train.str.contains(key)],
                       y_train], axis=1, join='inner')
            .groupby('sentiment_')['body_transform']
            .count()
            .reset_index()
        )
        count_sentiment['keyword'] = key
        frames.append(count_sentiment)

    if not frames:
        # No keyword to plot (empty `df` or nbKeyword <= 0)
        return

    df_top_sent = pd.concat(frames).pivot(index='keyword',
                                          columns='sentiment_',
                                          values='body_transform')
    # Row total, used only to order the bars
    df_top_sent['sum'] = df_top_sent.sum(axis=1)
    df_top_sent.sort_values(by='sum').drop(columns='sum').plot.barh(stacked=True)

In [None]:
# Sentiment breakdown for the 10 most frequent terms of `fdist`
plot_keyword_sentiment(df = fdist, nbKeyword= 10)

## Bigrams

The corpus definitely needs more cleaning.

In [None]:
# Build the bigrams message by message: running ngrams over the flattened
# token list would also pair the last word of one message with the first
# word of the next one.
bgs = [
    bigram
    for tokens in Word_tokenize
    for bigram in nltk.ngrams(tokens, 2)
]

fdist = nltk.FreqDist(bgs)

fdist.plot(30, cumulative=False)
plt.show()

## Fit the model

In [None]:
# Fit the whole pipeline on the training split
text_clf.fit(X_train, y_train)

In [None]:
# In-sample predictions (on the training split itself); show the first ten
y_pred = text_clf.predict(X_train)
y_pred[:10]

In [None]:
# Predict the test split and print the classification report against y_test
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test,
                                    predicted))

In [None]:
# Confusion matrix of the test-split predictions
metrics.confusion_matrix(y_test, predicted)

## Predict out of sample

In [None]:
# Select the messages whose `sentiment` is "Neutral"
query = {"sentiment": "Neutral"}

# to_train=False: only the cleaned text is returned, no split
X_predict = metatransformation(to_train=False, query=query)

In [None]:
# Apply the pipeline to the out-of-sample text
predicted = text_clf.predict(X_predict)

In [None]:
# Table with the cleaned text (and its original index) next to the prediction
pd.concat(
    [
        pd.Series(X_predict, name='body').reset_index(),
        pd.Series(predicted, name='predict'),
    ],
    axis=1,
)

## Get Bitcoins Data

Extracted from [Quandl](https://www.quandl.com/data/BCHAIN/MKPRU-Bitcoin-Market-Price-USD)

In [None]:
import os

import quandl

# Read the API key from the QUANDL_API_KEY environment variable instead of
# committing it to the notebook; when the variable is unset the key is None.
# NOTE(review): the key that was previously hard-coded here has been shared
# together with the notebook and should be revoked.
quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY")

In [None]:
bitcoin = quandl.get("BCHAIN/MKPRU")
# Compute the returns from the `Value` column explicitly: assigning the
# result of a whole-frame pct_change to one column only works as long as
# the frame has exactly one column.
bitcoin['returns'] = bitcoin['Value'].pct_change(1)
bitcoin.head()

In [None]:
# Line plot of the `Value` column
bitcoin['Value'].plot(title='Values of Bitcoins')


In [None]:
# Line plot of the returns, without the missing values
bitcoin['returns'].dropna().plot(title='Returns of Bitcoins')

## Daily aggregated sentiment

Compute the daily average

In [None]:
query = {"sentiment": {"$ne": "Neutral"}}
text = pd.DataFrame(list(db.messages.find(query)))
# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed
text["created_at"] = pd.to_datetime(text["created_at"])
text = (
    text
    .set_index("created_at")
    .drop(columns="id")
    # Keep only columns that can be averaged: in pandas 2.x, mean() raises
    # on non-numeric columns instead of silently dropping them
    .select_dtypes(include=["number", "bool"])
    .resample("D")
    .mean()
)

In [None]:
# Put both frames side by side, keeping only the index values present in both
timeseries = pd.concat([text, bitcoin], axis = 1, join="inner")

timeseries.head()

## Granger test

Test the Granger Causality between sentiment on social media and stock returns

### How does Granger causality test work?

It is based on the idea that if X causes Y, then the forecast of Y based on previous values of Y AND the previous values of X should outperform the forecast of Y based on previous values of Y alone.

According to Statsmodels 

The null hypothesis for `grangercausalitytests` is that the time series in the second column, x2, does NOT Granger cause the time series in the first column, x1. Granger causality means that past values of x2 have a statistically significant effect on the current value of x1, taking past values of x1 into account as regressors. We reject the null hypothesis that x2 does not Granger cause x1 if the p-values are below a desired size of the test.

The null hypothesis for all four tests is that the coefficients corresponding to past values of the second time series are zero.

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests

In [None]:
# Null hypothesis: `sentiment_` (second column) does not Granger cause
# `returns` (first column); maxlag is set to 4
grangercausalitytests(timeseries[['returns', 'sentiment_']], maxlag=4)

## Regress

$$r_{i, t}=\alpha+\beta_{1} \Delta s_{1, t}+\beta_{2} \Delta s_{i, t-1}+\epsilon_{t}$$


In [None]:
# Sentiment of the previous row
timeseries['sentiment_lag'] = timeseries['sentiment_'].shift(1)

In [None]:
# Row-over-row relative change of the sentiment and of its lag
# NOTE(review): pct_change is a relative change, while the regression formula
# in the notebook is written with a difference (Delta) -- confirm the intent
timeseries['L_s1'] = timeseries['sentiment_'].pct_change(1)
timeseries['L_s2'] = timeseries['sentiment_lag'].pct_change(1)
timeseries.head()

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# OLS of the returns on the two sentiment-change regressors
mod1 = smf.ols(
    data=timeseries,
    formula='returns ~ L_s1 + L_s2',
).fit()
mod1.summary()

## test Lexicon L1

In [None]:
# Download the L1 lexicon (semicolon-separated CSV) and preview it sorted by keyword
L1 = pd.read_csv('http://www.thomas-renault.com/l1_lexicon.csv', sep = ";")
L1.sort_values(by = 'keyword').head()

In [None]:
# Tokenise every training message into a list of words
Word_tokenize = X_train.apply(word_tokenize) 

# Appendix: Details steps & analytics

## Text Analysis Operations using NLTK

We use the full set with Bullish and Bearish 

### Tokenise pandas series

### TF-IDF:  Our approach

As explained in the previous post, the tf-idf vectorization of a corpus of text documents assigns each word in a document a number that is proportional to its frequency in the document and inversely proportional to the number of documents in which it occurs

TF: Term Frequency, which measures how frequently a term occurs in a document. Since documents differ in length, a term may appear many more times in a long document than in a short one. Thus, the term frequency is often divided by the document length (i.e. the total number of terms in the document) as a way of normalization:

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, certain terms, such as "is", "of", and "that", may appear many times but have little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

Compute the IDFs

Compute the TFIDF score

The higher the TF*IDF score (weight), the rarer the term and vice versa.