In [1]:
import re # regular expressions
from tqdm import tnrange, tqdm_notebook, tqdm
import pandas as pd

In [2]:
# Coin being processed; both values are interpolated into the data paths below.
CURRENCY = "bitcoin"
CURRENCY_SYMBOL = "BTC"
# Input CSV of tweets for the selected coin (path is relative to the working directory).
tweets_raw_file = f'data/twitter/{CURRENCY_SYMBOL}/{CURRENCY}_tweets_raw.csv'

In [8]:
# Load the raw tweets CSV into a DataFrame using pandas' default parsing options.
d = pd.read_csv(tweets_raw_file)

In [11]:
# Discard the tweet metadata columns, then remove the row whose index label is 0.
d1 = (
    d.drop(columns=['ID', 'UserName', 'UserFollowerCount', 'RetweetCount', 'Likes', 'CreatedAt'])
     .drop(index=0)
)

In [22]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it as a column.
d1= d1.reset_index(drop=True)

In [15]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance; tweet_cleaner_updated uses it to split cleaned text into words.
tok = WordPunctTokenizer()

# Twitter @mentions: '@' followed by handle characters.
pat1 = r'@[A-Za-z0-9_]+'
# http/https links. \S+ stops at ANY whitespace (space, tab, newline), so a
# link at the end of a line no longer swallows the first word of the next line.
pat2 = r'https?://\S+'
combined_pat = r'|'.join((pat1, pat2))
# Scheme-less links. The dot is escaped so that only a literal "www." matches;
# unescaped, it matched any character and ate text such as "awww that".
www_pat = r'www\.\S+'
# Contractions expanded to their two-word form so the negation survives the
# later removal of apostrophes. Keys must be lower-case: they are matched
# against already lower-cased text.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# One alternation over all contractions; keys are escaped so that a future key
# containing a regex metacharacter cannot change the pattern's meaning.
neg_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, negations_dic)) + r')\b')

def tweet_cleaner_updated(text):
    """Normalise one raw tweet into a lower-case, letters-only string.

    Steps, in order: strip HTML markup/entities, replace the mis-decoded
    replacement-character sequence with "?", remove @mentions and links,
    lower-case, expand the contractions listed in ``negations_dic``, turn
    every non-letter into a space, then tokenize and drop one-character tokens.

    Relies on the module-level ``tok``, ``combined_pat``, ``www_pat``,
    ``neg_pattern`` and ``negations_dic``.

    Args:
        text: Raw tweet text (``str`` or ``bytes``). Any other value, such as
            ``None`` or the float NaN pandas uses for an empty CSV cell, is
            treated as an empty tweet.

    Returns:
        The cleaned tweet as one space-separated string; ``""`` when nothing
        survives cleaning or the input is not text.
    """
    # Missing values are not text and cannot be cleaned; return early rather
    # than raising part-way through a long batch run.
    if not isinstance(text, (str, bytes)):
        return ""
    souped = BeautifulSoup(text, 'lxml').get_text()
    # str.replace cannot fail on a str, so no try/except is needed here.
    bom_removed = souped.replace(u"ï¿½", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # The substitution above leaves runs of spaces; tokenizing and re-joining
    # collapses them. Single-character tokens are dropped.
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    return " ".join(words)

# Smoke-test the cleaner on the first 100 tweets before the full run.
testing = d1['Text'].iloc[:100]
test_result = [tweet_cleaner_updated(tweet) for tweet in testing]

In [23]:
# Clean every tweet in d1. Rows are walked by position, so the loop does not
# depend on d1's index labels being exactly 0..n-1 (a label lookup such as
# d1['Text'][i] raises KeyError when the index has gaps).
nums = [0, d1.shape[0]]
print("Cleaning and parsing the tweets...\n")
clean_tweet_texts = []
for i, raw_text in enumerate(d1['Text'].iloc[nums[0]:nums[1]], start=nums[0]):
    # Progress line every 10,000 tweets.
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, nums[1]))
    clean_tweet_texts.append(tweet_cleaner_updated(raw_text))

Cleaning and parsing the tweets...

Tweets 10000 of 43250 has been processed
Tweets 20000 of 43250 has been processed
Tweets 30000 of 43250 has been processed
Tweets 40000 of 43250 has been processed


In [43]:
# Wrap the cleaned strings in a single-column frame named 'text' and preview the first rows.
clean_df = pd.DataFrame(clean_tweet_texts,columns=['text'])
clean_df.head()

Unnamed: 0,text
0,become bitcoin developer basic via
1,btc buying pressure alert price trading around...
2,rt embarrassed to say shorted bitcoin yesterda...
3,rt hpt price usdt hpt trading pairs hpt volume...
4,rt ll give btc or paypal to someone who retwee...


In [44]:
# Add a 'target' column set to 0 for every row.
# NOTE(review): looks like a placeholder label — confirm how real targets are assigned.
clean_df['target']=0

In [45]:
# Output CSV for the cleaned tweets, built from the same coin name and symbol constants.
tweets_clean_file = f'data/twitter/{CURRENCY_SYMBOL}/{CURRENCY}_tweets_clean.csv'

In [46]:
# Persist the cleaned frame as UTF-8. The index is not disabled, so it is written as the first CSV column.
clean_df.to_csv(tweets_clean_file,encoding='utf-8')

In [47]:
# Read the cleaned file back, using the first CSV column as the index.
data = pd.read_csv(tweets_clean_file,index_col=0)

In [48]:
# Preview the reloaded frame.
data.head()

Unnamed: 0,text,target
0,become bitcoin developer basic via,0
1,btc buying pressure alert price trading around...,0
2,rt embarrassed to say shorted bitcoin yesterda...,0
3,rt hpt price usdt hpt trading pairs hpt volume...,0
4,rt ll give btc or paypal to someone who retwee...,0


In [7]:
# importing required libraries
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row

# Initialise Spark. getOrCreate() reuses an already-running session/context, so
# re-running this cell (or running it in a kernel that was launched with a
# context) does not fail with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.appName("PySparkShell").getOrCreate()
# Keep `sc` available for code that expects the underlying SparkContext.
sc = spark.sparkContext

In [8]:
# Explicit schema for the tweets CSV: one field per column, all nullable.
my_schema = tp.StructType([
  tp.StructField(name= 'ID',          dataType= tp.StringType(),  nullable= True),
  tp.StructField(name= 'Text',       dataType= tp.StringType(),  nullable= True),
  tp.StructField(name= 'UserName',       dataType= tp.StringType(),   nullable= True),
  tp.StructField(name= 'UserFollowerCount',          dataType= tp.IntegerType(),  nullable= True),
  tp.StructField(name= 'RetweetCount',          dataType= tp.IntegerType(),  nullable= True),
  tp.StructField(name= 'Likes',          dataType= tp.IntegerType(),  nullable= True),
  tp.StructField(name= 'CreatedAt',          dataType= tp.StringType(),  nullable= True),
])

# This schema lists the raw export's columns (ID, Text, UserName, ...), not the
# cleaned file's (index, text, target), so the raw file is the one to read.
# Applying it to the cleaned CSV would assign columns by position and leave
# most fields null.
# NOTE(review): confirm the raw file is the intended pipeline input.
my_data = spark.read.csv(tweets_raw_file,
                         schema=my_schema,
                         header=True)
# view the data
my_data.show(5)

# print the schema of the file
my_data.printSchema()

+-------------------+--------------------+------------+-----------------+------------+-----+--------------------+
|                 ID|                Text|    UserName|UserFollowerCount|RetweetCount|Likes|           CreatedAt|
+-------------------+--------------------+------------+-----------------+------------+-----+--------------------+
|               null|                null|        null|             null|        null| null|                null|
|1256595810265051136|Become A Bitcoin ...| Naked Wicks|               22|           0|    0|Sat May 02 14:46:...|
|1256595809077874688|⬆️⬆️ $BTC BUYING ...|CryptoSquawk|              284|           0|    0|Sat May 02 14:46:...|
|1256595805978284032|RT : I'm embarras...|    kjvbbnnn|                0|           5|    0|Sat May 02 14:46:...|
|1256595802736046081|RT : HPT Price US...|         kim|               67|          27|    0|Sat May 02 14:46:...|
+-------------------+--------------------+------------+-----------------+------------+--

In [9]:
# define stage 1: tokenize the 'Text' column into 'tokens', using the regex \W as the pattern
stage_1 = RegexTokenizer(inputCol= 'Text' , outputCol= 'tokens', pattern= '\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol= 'tokens', outputCol= 'filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol= 'filtered_words', outputCol= 'vector', vectorSize= 100)
# define stage 4: Logistic Regression Model
# NOTE(review): labelCol 'label' is not a field of my_schema — confirm where the label column is created.
model = LogisticRegression(featuresCol= 'vector', labelCol= 'label')