In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# twitter data cleaning
df=pd.read_csv('Tweets.csv')

In [3]:
df.head()

Unnamed: 0,Date,Tweets
0,2020-09-28,#NZDCAD Target Hit! 90 PIPS Profit 🔥\n\nFor Fo...
1,2020-09-28,#AAPL #AMZN Amazon Confirms Prime Day Coming O...
2,2020-09-28,1 hour chart for #AAPL $AAPL. Next target pri...
3,2020-09-28,EURNZD TP2 Hit! 180 PIPS PROFIT 🔥👍🏻⚡️\n\nGet D...
4,2020-09-28,EURNZD TP2 Hit! 180 PIPS PROFIT 🔥👍🏻⚡️\n\nJoin ...


In [4]:
# data cleaning and implementing bag of words
# converting into lower case
df['Tweets']=df['Tweets'].str.lower()
df.head()

Unnamed: 0,Date,Tweets
0,2020-09-28,#nzdcad target hit! 90 pips profit 🔥\n\nfor fo...
1,2020-09-28,#aapl #amzn amazon confirms prime day coming o...
2,2020-09-28,1 hour chart for #aapl $aapl. next target pri...
3,2020-09-28,eurnzd tp2 hit! 180 pips profit 🔥👍🏻⚡️\n\nget d...
4,2020-09-28,eurnzd tp2 hit! 180 pips profit 🔥👍🏻⚡️\n\njoin ...


In [5]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

## removing special characters

In [6]:
def remove_special_charater(df):
    for i in range(len(df['Tweets'])):
        df['Tweets'][i]=re.sub('[^ a-zA-Z0-9]', '', df['Tweets'][i])

In [7]:
remove_special_charater(df)

## converting sentences  into words

In [8]:
import nltk
def converting_into_words(df):
    for i in range(len(df['Tweets'])):
        df['Tweets'][i]=nltk.word_tokenize(df['Tweets'][i])

In [9]:
converting_into_words(df)

In [10]:
df['Tweets'][0]

['nzdcad',
 'target',
 'hit',
 '90',
 'pips',
 'profit',
 'for',
 'forex',
 'signals',
 'join',
 'httpstcojxmfy9z09kaapl',
 'btc',
 'forextrading',
 'chfjpy',
 'gbpjpy',
 'dax30',
 'audjpy',
 'audnzd',
 'forex',
 'forexmarket',
 'silver',
 'eurgbp',
 'stocks',
 'eurcad',
 'fx',
 'usoil',
 'brentoil',
 'bitcoin',
 'ukoil',
 'gold',
 'euraud',
 '24751',
 'httpstcozto5dcgobk']

In [11]:
# appling lemmatization and removing stop words
def stopword_remover_and_lemm(df):
    lemmatizer=WordNetLemmatizer()
    for i in range(len(df['Tweets'])):
        df['Tweets'][i]=[lemmatizer.lemmatize(word) for word in df['Tweets'][i] if word not in set(stopwords.words('english'))]
        #df['Tweets'][i]=''.join(df['Tweets'][i])
        

In [12]:
df.head()

Unnamed: 0,Date,Tweets
0,2020-09-28,"[nzdcad, target, hit, 90, pips, profit, for, f..."
1,2020-09-28,"[aapl, amzn, amazon, confirms, prime, day, com..."
2,2020-09-28,"[1, hour, chart, for, aapl, aapl, next, target..."
3,2020-09-28,"[eurnzd, tp2, hit, 180, pips, profit, get, dai..."
4,2020-09-28,"[eurnzd, tp2, hit, 180, pips, profit, join, no..."


In [13]:
df['Tweets'][0]

['nzdcad',
 'target',
 'hit',
 '90',
 'pips',
 'profit',
 'for',
 'forex',
 'signals',
 'join',
 'httpstcojxmfy9z09kaapl',
 'btc',
 'forextrading',
 'chfjpy',
 'gbpjpy',
 'dax30',
 'audjpy',
 'audnzd',
 'forex',
 'forexmarket',
 'silver',
 'eurgbp',
 'stocks',
 'eurcad',
 'fx',
 'usoil',
 'brentoil',
 'bitcoin',
 'ukoil',
 'gold',
 'euraud',
 '24751',
 'httpstcozto5dcgobk']

### Displaying the data with date and tweets, you can notice there are multiple tweets for each day. So we will club them together .

In [14]:
stopword_remover_and_lemm(df)

In [15]:
df

Unnamed: 0,Date,Tweets
0,2020-09-28,"[nzdcad, target, hit, 90, pip, profit, forex, ..."
1,2020-09-28,"[aapl, amzn, amazon, confirms, prime, day, com..."
2,2020-09-28,"[1, hour, chart, aapl, aapl, next, target, pri..."
3,2020-09-28,"[eurnzd, tp2, hit, 180, pip, profit, get, dail..."
4,2020-09-28,"[eurnzd, tp2, hit, 180, pip, profit, join, fre..."
5,2020-09-28,"[little, adjustment, signal, bot, im, excited,..."
6,2020-09-28,"[trade, idea, apple, inc, long, nasdaqaapl, lo..."
7,2020-09-28,"[eurnzd, tp2, hit, 180, pip, profit, free, sig..."
8,2020-09-28,"[rt, popeych, aapl, 15min, 12050, im, looking,..."
9,2020-09-28,"[optionsmike, topgunfp, something, rotten, aap..."


In [16]:
df['Tweets'][0]

['nzdcad',
 'target',
 'hit',
 '90',
 'pip',
 'profit',
 'forex',
 'signal',
 'join',
 'httpstcojxmfy9z09kaapl',
 'btc',
 'forextrading',
 'chfjpy',
 'gbpjpy',
 'dax30',
 'audjpy',
 'audnzd',
 'forex',
 'forexmarket',
 'silver',
 'eurgbp',
 'stock',
 'eurcad',
 'fx',
 'usoil',
 'brentoil',
 'bitcoin',
 'ukoil',
 'gold',
 'euraud',
 '24751',
 'httpstcozto5dcgobk']

In [19]:
# converting tweets into string and remove brackets
for i in range(len(df['Tweets'])):
    df['Tweets'][i]=str(df['Tweets'][i])

In [20]:
df.head()

Unnamed: 0,Date,Tweets
0,2020-09-28,"['nzdcad', 'target', 'hit', '90', 'pip', 'prof..."
1,2020-09-28,"['aapl', 'amzn', 'amazon', 'confirms', 'prime'..."
2,2020-09-28,"['1', 'hour', 'chart', 'aapl', 'aapl', 'next',..."
3,2020-09-28,"['eurnzd', 'tp2', 'hit', '180', 'pip', 'profit..."
4,2020-09-28,"['eurnzd', 'tp2', 'hit', '180', 'pip', 'profit..."


In [21]:
remove_special_charater(df)

In [22]:
df

Unnamed: 0,Date,Tweets
0,2020-09-28,nzdcad target hit 90 pip profit forex signal j...
1,2020-09-28,aapl amzn amazon confirms prime day coming oct...
2,2020-09-28,1 hour chart aapl aapl next target price 118 h...
3,2020-09-28,eurnzd tp2 hit 180 pip profit get daily forex ...
4,2020-09-28,eurnzd tp2 hit 180 pip profit join free signal...
5,2020-09-28,little adjustment signal bot im excited try to...
6,2020-09-28,trade idea apple inc long nasdaqaapl long h4 b...
7,2020-09-28,eurnzd tp2 hit 180 pip profit free signal join...
8,2020-09-28,rt popeych aapl 15min 12050 im looking bearish...
9,2020-09-28,optionsmike topgunfp something rotten aapl fed...


In [23]:
df=df.sort_values(by="Date")

In [25]:
df2=df.copy()

In [26]:
# converting str date into date time  objec
import datetime as dt
y,m,d=df2['Date'][0].split("-")
dt.datetime(int(y),int(m),int(d))

datetime.datetime(2020, 9, 28, 0, 0)

In [27]:
for i in range(len(df2['Date'])):
    y,m,d=df2['Date'][i].split("-")
    df2['Date'][i]=dt.datetime(int(y),int(m),int(d))

In [28]:
df2

Unnamed: 0,Date,Tweets
707,2020-09-19 00:00:00,rt ankitmadx recent high yesterday low pointna...
675,2020-09-19 00:00:00,make sense aapl stock tanking hold zero bitcoi...
676,2020-09-19 00:00:00,atampt apple need make bigger deal launch 5g i...
677,2020-09-19 00:00:00,spxtrades aapl would love 9933 add 82
678,2020-09-19 00:00:00,sell aapl chart httpstcoej84ijgndm
679,2020-09-19 00:00:00,aapl contact u receive entry exit signal aapl ...
680,2020-09-19 00:00:00,rt ai2stock stockmarket fell much last 3 weeks...
681,2020-09-19 00:00:00,stockmarket fell much last 3 weekssince sep 2 ...
682,2020-09-19 00:00:00,aapl 1d4h stochs 1d could bullish lt gt50 stlt...
683,2020-09-19 00:00:00,aapl 1d4h macd ingredient short stockspx https...


In [29]:
df2.head()

Unnamed: 0,Date,Tweets
707,2020-09-19 00:00:00,rt ankitmadx recent high yesterday low pointna...
675,2020-09-19 00:00:00,make sense aapl stock tanking hold zero bitcoi...
676,2020-09-19 00:00:00,atampt apple need make bigger deal launch 5g i...
677,2020-09-19 00:00:00,spxtrades aapl would love 9933 add 82
678,2020-09-19 00:00:00,sell aapl chart httpstcoej84ijgndm


In [30]:
df3=df2.copy()

In [31]:
df3.head()

Unnamed: 0,Date,Tweets
707,2020-09-19 00:00:00,rt ankitmadx recent high yesterday low pointna...
675,2020-09-19 00:00:00,make sense aapl stock tanking hold zero bitcoi...
676,2020-09-19 00:00:00,atampt apple need make bigger deal launch 5g i...
677,2020-09-19 00:00:00,spxtrades aapl would love 9933 add 82
678,2020-09-19 00:00:00,sell aapl chart httpstcoej84ijgndm


In [32]:
df3=df3.groupby(['Date'], as_index = False).agg({'Tweets': ','.join})

In [33]:
df3['Tweets'][0]

'rt ankitmadx recent high yesterday low pointnasdaq 14 dow 4nifty 3aapl 22 google 20 msft 15,make sense aapl stock tanking hold zero bitcoin balance sheet massive amount cash cash becoming worthless fast prestonpysh jeffbooth johnkvallis michaelsaylor,atampt apple need make bigger deal launch 5g iphone october watch stock price aapl httpstcoeoohewgaia httpstcoiuxzp4ueoe via pocketlint stockmarket iphone nasdaq100,spxtrades aapl would love 9933 add 82,sell aapl chart httpstcoej84ijgndm,aapl contact u receive entry exit signal aapl 13 profit join platinum trading room link biostockoptions stockoptionstrading stockoption stockmarket startinvesting stockmarkettips optionstrader optionstrading optiontrading httpstcosf9cvq03f2,rt ai2stock stockmarket fell much last 3 weekssince sep 2 26 9 11 nasdq nyse tsx stock fell 10,stockmarket fell much last 3 weekssince sep 2 26 9 11 nasdq nyse tsx stock fell 10 aapl fell 19 amzn facebook google fell 16 httpstcokhvbz3no7q,aapl 1d4h stochs 1d could bull

In [34]:
# removing commas
remove_special_charater(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
df3

Unnamed: 0,Date,Tweets
0,2020-09-19,rt ankitmadx recent high yesterday low pointna...
1,2020-09-20,daytradecoach sure study course trading im giv...
2,2020-09-21,rt kisstrade aapl really need break 117 level ...
3,2020-09-22,aapl amzn big tech going green httpstco1vpj8jz...
4,2020-09-23,usdcad target hit 150 pip profit free signal j...
5,2020-09-24,first thousand dollar day 1 hour work 26k week...
6,2020-09-25,apple finally start selling directly customer ...
7,2020-09-26,could resist share bear feasting aapl httpstco...
8,2020-09-27,rt sh4rizel trending alt blueprotocol low tota...
9,2020-09-28,rt popeych aapl 15min 12050 im looking bearish...


In [36]:
df3['Tweets'][0]

'rt ankitmadx recent high yesterday low pointnasdaq 14 dow 4nifty 3aapl 22 google 20 msft 15make sense aapl stock tanking hold zero bitcoin balance sheet massive amount cash cash becoming worthless fast prestonpysh jeffbooth johnkvallis michaelsayloratampt apple need make bigger deal launch 5g iphone october watch stock price aapl httpstcoeoohewgaia httpstcoiuxzp4ueoe via pocketlint stockmarket iphone nasdaq100spxtrades aapl would love 9933 add 82sell aapl chart httpstcoej84ijgndmaapl contact u receive entry exit signal aapl 13 profit join platinum trading room link biostockoptions stockoptionstrading stockoption stockmarket startinvesting stockmarkettips optionstrader optionstrading optiontrading httpstcosf9cvq03f2rt ai2stock stockmarket fell much last 3 weekssince sep 2 26 9 11 nasdq nyse tsx stock fell 10stockmarket fell much last 3 weekssince sep 2 26 9 11 nasdq nyse tsx stock fell 10 aapl fell 19 amzn facebook google fell 16 httpstcokhvbz3no7qaapl 1d4h stochs 1d could bullish lt g

In [37]:
df3.to_csv("cleaningTweets.csv",index=False)

# importing stock price data for AAPL from yahoo finanace

In [65]:
df4=pd.read_csv('AAPL_stock_data.csv')

In [39]:
df4

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-09-18,110.4,110.88,106.09,106.84,287104900,0,0
1,2020-09-21,104.54,110.19,103.1,110.08,195713800,0,0
2,2020-09-22,112.68,112.86,109.16,111.81,183055400,0,0
3,2020-09-23,111.62,112.11,106.77,107.12,150718700,0,0
4,2020-09-24,105.17,110.25,105.0,108.22,167743300,0,0
5,2020-09-25,108.43,112.44,107.67,112.28,149736300,0,0


In [44]:
# adding a price column in our data frame

df3["Prices"]=""

In [45]:
df3

Unnamed: 0,Date,Tweets,Prices
0,2020-09-19,rt ankitmadx recent high yesterday low pointna...,
1,2020-09-20,daytradecoach sure study course trading im giv...,
2,2020-09-21,rt kisstrade aapl really need break 117 level ...,
3,2020-09-22,aapl amzn big tech going green httpstco1vpj8jz...,
4,2020-09-23,usdcad target hit 150 pip profit free signal j...,
5,2020-09-24,first thousand dollar day 1 hour work 26k week...,
6,2020-09-25,apple finally start selling directly customer ...,
7,2020-09-26,could resist share bear feasting aapl httpstco...,
8,2020-09-27,rt sh4rizel trending alt blueprotocol low tota...,
9,2020-09-28,rt popeych aapl 15min 12050 im looking bearish...,


In [47]:
df4

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-09-18,110.4,110.88,106.09,106.84,287104900,0,0
1,2020-09-21,104.54,110.19,103.1,110.08,195713800,0,0
2,2020-09-22,112.68,112.86,109.16,111.81,183055400,0,0
3,2020-09-23,111.62,112.11,106.77,107.12,150718700,0,0
4,2020-09-24,105.17,110.25,105.0,108.22,167743300,0,0
5,2020-09-25,108.43,112.44,107.67,112.28,149736300,0,0


In [49]:
for i in range(len(df4['Date'])):
    y,m,d=df4['Date'][i].split("-")
    df4['Date'][i]=dt.datetime(int(y),int(m),int(d))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
df4

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-09-18 00:00:00,110.4,110.88,106.09,106.84,287104900,0,0
1,2020-09-21 00:00:00,104.54,110.19,103.1,110.08,195713800,0,0
2,2020-09-22 00:00:00,112.68,112.86,109.16,111.81,183055400,0,0
3,2020-09-23 00:00:00,111.62,112.11,106.77,107.12,150718700,0,0
4,2020-09-24 00:00:00,105.17,110.25,105.0,108.22,167743300,0,0
5,2020-09-25 00:00:00,108.43,112.44,107.67,112.28,149736300,0,0


In [62]:
df3=pd.read_csv('cleaningTweets.csv')

In [63]:
df3

Unnamed: 0,Date,Tweets
0,2020-09-19,rt ankitmadx recent high yesterday low pointna...
1,2020-09-20,daytradecoach sure study course trading im giv...
2,2020-09-21,rt kisstrade aapl really need break 117 level ...
3,2020-09-22,aapl amzn big tech going green httpstco1vpj8jz...
4,2020-09-23,usdcad target hit 150 pip profit free signal j...
5,2020-09-24,first thousand dollar day 1 hour work 26k week...
6,2020-09-25,apple finally start selling directly customer ...
7,2020-09-26,could resist share bear feasting aapl httpstco...
8,2020-09-27,rt sh4rizel trending alt blueprotocol low tota...
9,2020-09-28,rt popeych aapl 15min 12050 im looking bearish...


In [66]:
df4

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-09-18,110.4,110.88,106.09,106.84,287104900,0,0
1,2020-09-21,104.54,110.19,103.1,110.08,195713800,0,0
2,2020-09-22,112.68,112.86,109.16,111.81,183055400,0,0
3,2020-09-23,111.62,112.11,106.77,107.12,150718700,0,0
4,2020-09-24,105.17,110.25,105.0,108.22,167743300,0,0
5,2020-09-25,108.43,112.44,107.67,112.28,149736300,0,0


In [69]:
df=pd.merge(df3,df4,on=["Date"],how="left")

In [71]:
df.to_csv("Merge_Data.csv",index=False)

In [72]:
df2=pd.read_csv('Merge_Data.csv')

In [73]:
df2

Unnamed: 0,Date,Tweets,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-09-19,rt ankitmadx recent high yesterday low pointna...,,,,,,,
1,2020-09-20,daytradecoach sure study course trading im giv...,,,,,,,
2,2020-09-21,rt kisstrade aapl really need break 117 level ...,104.54,110.19,103.1,110.08,195713800.0,0.0,0.0
3,2020-09-22,aapl amzn big tech going green httpstco1vpj8jz...,112.68,112.86,109.16,111.81,183055400.0,0.0,0.0
4,2020-09-23,usdcad target hit 150 pip profit free signal j...,111.62,112.11,106.77,107.12,150718700.0,0.0,0.0
5,2020-09-24,first thousand dollar day 1 hour work 26k week...,105.17,110.25,105.0,108.22,167743300.0,0.0,0.0
6,2020-09-25,apple finally start selling directly customer ...,108.43,112.44,107.67,112.28,149736300.0,0.0,0.0
7,2020-09-26,could resist share bear feasting aapl httpstco...,,,,,,,
8,2020-09-27,rt sh4rizel trending alt blueprotocol low tota...,,,,,,,
9,2020-09-28,rt popeych aapl 15min 12050 im looking bearish...,,,,,,,
