# Split the clean.csv file into multiple files. Compute VADER sentiment and score

In [128]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm

# Define the currency
#CURRENCY = "zilliqa"
#CURRENCY_SYMBOL = "ZIL"
CURRENCY = "nexo"
CURRENCY_SYMBOL = "NEXO"
#CURRENCY = "bitcoin"
#CURRENCY_SYMBOL = "BTC"
tweets_clean_file = f'data/twitter/{CURRENCY_SYMBOL}/{CURRENCY}_tweets_clean.csv'
path = f'data/twitter/{CURRENCY_SYMBOL}' #/{CURRENCY}_tweets_clean.csv'


## Read the cleaned file

In [129]:
df_clean = pd.read_csv(tweets_clean_file)
print(df_clean.shape)
df_clean.head(5)

(2576, 7)


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt
0,1001426024406634497,RT : Nexo Maersk inbound APM Terminal Valencia...,Heric ♥‿♥ Montalvo,434,1,0,Tue May 29 11:32:24 +0000 2018
1,1001425728255266816,$NEXO is a good project and a good buy at thes...,Trader Afrique,229,0,2,Tue May 29 11:31:13 +0000 2018
2,1001425151039160320,eBay: New &amp; Sealed LEGO Nexo Knights King'...,US LEGO Set Sales,150,0,0,Tue May 29 11:28:56 +0000 2018
3,1001424956675129349,RT : Nexo Community is now the largest crypto ...,Nanda Dwi Harto,3,571,0,Tue May 29 11:28:09 +0000 2018
4,1001424234139222016,RT : Cryptune retirement bags 🎲💜💎🌊🎲$DBIX - Pal...,Crypto_Huey,84,124,0,Tue May 29 11:25:17 +0000 2018


In [130]:
df_clean = df_clean.sort_values(by='ID') # the bigger the ID, the most recent the tweet 

## Sentiment analysis with Vader

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media.

VADER takes into account 
- negations and contractions (not good, wasn’t good)
- Ponctuation (good!!!), CAPS, emotes :), emojis 
- Intensificators (very, kind of), acronyms ‘lol’
- Scores between -1.0 (negative) and 1.0 (positive)

We will use this sentiment analysis of the tweets to calculate a score that will represent the importance of each tweet.

In [131]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(df_clean['Text'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
df_clean["compound"] = compound
df_clean.head(2)

100%|██████████| 2576/2576 [00:00<00:00, 3737.68it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound
2575,997843451952615424,RT : Nexo Community is now the largest crypto ...,Riasat Ali,165,571,0,Sat May 19 14:16:32 +0000 2018,0.8356
2574,997847715433385992,RT : best experience in the crypto-space so fa...,CCrypto,34,45,0,Sat May 19 14:33:29 +0000 2018,0.6988


## Calculate a score for each tweet

To calculate the score for each tweet, we use different variables to which we had a weight based on its importance.

The compound column represents the sentiment of the tweets and its value is between -1 and 1.

We also use the number of retweets, the number of likes, and the number of users that follow the tweet's author.

In [132]:
scores = []
for i, s in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    scores.append(s["compound"] * ((s["UserFollowerCount"]+1)) * ((s["Likes"]+1)))
df_clean["score"] = scores
df_clean.head(2)

100%|██████████| 2576/2576 [00:00<00:00, 13004.51it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound,score
2575,997843451952615424,RT : Nexo Community is now the largest crypto ...,Riasat Ali,165,571,0,Sat May 19 14:16:32 +0000 2018,0.8356,138.7096
2574,997847715433385992,RT : best experience in the crypto-space so fa...,CCrypto,34,45,0,Sat May 19 14:33:29 +0000 2018,0.6988,24.458


## Split dataframe and save it into multiple files

In [133]:
from datetime import datetime

In [134]:
n = 20000  #chunk row size
chunks_df = [df_clean[i:i+n] for i in range(0,df_clean.shape[0],n)]

sep_char = '~'
for chunk_df in chunks_df:
    chunk_min = chunk_df['ID'].min()
    chunk_max = chunk_df['ID'].max()
    date_from = (datetime.strptime(chunk_df.iloc[0]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    date_to = (datetime.strptime(chunk_df.iloc[-1]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    print(date_from, date_to)

    # Write into csv
    chunk_df.to_csv(f"{path}/{date_from}{sep_char}{date_to}.csv", header=True, index=False)
    


2018-05-19 14-16-32 2018-05-29 11-32-24


## Update var.csv

In [135]:
import glob
import numpy as np

ENVS = ['CRYPTO', 'LINE_COUNT', 'MOST_RECENT_FILE', 'MOST_RECENT_ID'] # Stored in var.csv

def get_var(key, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    return df_var[key].loc[df_var['CRYPTO'] == crypto].values[0]

def update_var(key, value, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    df_var[key].loc[df_var['CRYPTO'] == crypto] = str(value)
    df_var.to_csv("var.csv", index=False)
    
def add_new_crypto(crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',',
                         dtype={'LINE_COUNT': np.int32})
    if df_var[ENVS[0]].loc[df_var['CRYPTO'] == crypto].empty:
        new_line = pd.DataFrame([[crypto,-1,"",0]], columns=ENVS)
        df_var = df_var.append(new_line)
        df_var.to_csv("data/twitter/var.csv", index=False)

In [137]:
files = glob.glob(f"{path}/*~*.csv.csv")
files = sorted(files)
last_file = files[-1]
print(last_file)
last_df = pd.read_csv(last_file)
last_elem = last_df.tail(1)
print(last_elem['ID'])
print(last_df.shape)

add_new_crypto(CURRENCY_SYMBOL)
update_var(ENVS[1], last_df.shape[0], CURRENCY_SYMBOL)
update_var(ENVS[2], last_file, CURRENCY_SYMBOL)
update_var(ENVS[3], last_elem, CURRENCY_SYMBOL)

data/twitter/NEXO/2018-05-19 14-16-32~2018-05-29 11-32-24.csv
2575    1001426024406634497
Name: ID, dtype: int64
(2576, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [138]:
var_df = pd.read_csv('data/twitter/var.csv')
var_df

Unnamed: 0,CRYPTO,LINE_COUNT,MOST_RECENT_FILE,MOST_RECENT_ID
0,BTC,556,data/twitter/BTC/2018-05-29 12-20-53~2018-05-2...,1001439557504692224
1,NEXO,2576,data/twitter/NEXO/2018-05-19 14-16-32~2018-05-...,1001426024406634497
2,ZIL,14917,data/twitter/ZIL/2018-05-19 13-38-38~2018-05-2...,1001428255268917250
