In [26]:
import pandas as pd
import time
import datetime
import csv
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
import text_hammer as th

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [27]:
reddit_file_path = 'r_wallstreetbets_posts.csv'

In [28]:
df = pd.read_csv(reddit_file_path)
columns_to_keep = ['title', 'created_utc'] 

df = df[columns_to_keep]
df['created_utc'] = pd.to_datetime(df['created_utc'],unit='s')


df['date'] = df['created_utc'].dt.date
df['time'] = df['created_utc'].dt.time
df = df.drop(columns=['created_utc'])
print(df.head())


  df = pd.read_csv(reddit_file_path)


                                               title        date      time
0                          Whats going on with PLTR?  2021-02-16  09:53:12
1  Need explanations on Level 2 data for GME, why...  2021-02-16  09:53:09
2       XRT is being used as a laundry short machine  2021-02-16  09:52:46
3                                          Airlines?  2021-02-16  09:52:45
4                                         Buy TRXC 🚀  2021-02-16  09:52:44


In [29]:
search_strings = ['tesla', 'TSLA', 'Tesla']

# Create a filter mask
mask = df['title'].str.contains('|'.join(search_strings), case=False, na=False)

# Filter the DataFrame
tesla_df = df[mask]


In [30]:
print(tesla_df['date'].min())
print(tesla_df['date'].max())

2013-05-09
2021-02-16


In [31]:
tesla_df.head()

Unnamed: 0,title,date,time
382,Trendies and Tesla’s are gone. Right as we wer...,2021-02-16,05:49:44
553,If you missed out on Mara and riot last year t...,2021-02-16,04:30:15
588,Tesla Should Go Bankrupt,2021-02-16,04:17:16
713,~$1mil loss Tesla pin risk over the weekend 88...,2021-02-16,03:28:43
723,Fisker FSR - High Level DD - Primed to be Tesl...,2021-02-16,03:23:51


In [32]:
start_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2020-12-31')

# Filter the DataFrame
tesla_df = tesla_df[(tesla_df['date'] >= start_date) & (tesla_df['date'] <= end_date)]

# Display the first few rows of the filtered DataFrame
print(tesla_df.head())

                                                    title        date  \
579386                                 Thoughts on Tesla?  2020-12-31   
579399  Finally broke 200k right before year end. Next...  2020-12-31   
579430  PLTR soon to moon? 🚀. Here’s a comparison of T...  2020-12-31   
579477  How many of you rich Tesla people are waiting ...  2020-12-31   
579484             I have 10,000 to put into a Tesla call  2020-12-31   

            time  
579386  23:33:36  
579399  23:09:39  
579430  22:26:40  
579477  21:19:35  
579484  21:11:23  


  result = libops.scalar_compare(x.ravel(), y, op)


In [33]:
def text_preprocessing(df,col_name):
    #remove URL
    df['processed'] = df[col_name].str.replace(r'http(\S)+', r'')
    df['processed'] = df['processed'].str.replace(r'http ...', r'')
    df['processed'] = df['processed'].str.replace(r'http', r'')
    df[df['processed'].str.contains(r'http')]
   # remove RT, @
    df['processed'] = df['processed'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
    df[df['processed'].str.contains(r'RT[ ]?@')]
    df['processed'] = df['processed'].str.replace(r'@[\S]+',r'')
    #remove non-ascii words and characters
    df['processed'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df['processed']]
    df['processed'] = df['processed'].str.replace(r'_[\S]?',r'')
    #remove &, < and >
    df['processed'] = df['processed'].str.replace(r'&amp;?',r'and')
    df['processed'] = df['processed'].str.replace(r'&lt;',r'<')
    df['processed'] = df['processed'].str.replace(r'&gt;',r'>')
    # remove extra space
    df['processed'] = df['processed'].str.replace(r'[ ]{2, }',r' ')
    # insert space between punctuation marks
    df['processed'] = df['processed'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
    df['processed'] = df['processed'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')
    # lower case and strip white spaces at both ends
    df['processed'] = df['processed'].str.lower()
    df['processed'] = df['processed'].str.strip()

    df['word_count'] = [len(text.split(' ')) for text in df['processed']]
    df['word_count'].value_counts()
    df = df[df['word_count']>3]
    df = df.drop_duplicates(subset=['processed'])

    return df

In [34]:
tesla_df = text_preprocessing(tesla_df,'title')

  df['processed'] = df[col_name].str.replace(r'http(\S)+', r'')
  df['processed'] = df['processed'].str.replace(r'http ...', r'')
  df['processed'] = df['processed'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
  df['processed'] = df['processed'].str.replace(r'@[\S]+',r'')
  df['processed'] = df['processed'].str.replace(r'_[\S]?',r'')
  df['processed'] = df['processed'].str.replace(r'&amp;?',r'and')
  df['processed'] = df['processed'].str.replace(r'[ ]{2, }',r' ')
  df['processed'] = df['processed'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
  df['processed'] = df['processed'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')


In [35]:

tesla_df = tesla_df.drop(columns='title')

In [36]:
tesla_df.head()

Unnamed: 0,date,time,processed,word_count
579386,2020-12-31,23:33:36,thoughts on tesla ?,4
579399,2020-12-31,23:09:39,finally broke 200k right before year end . nex...,68
579430,2020-12-31,22:26:40,pltr soon to moon ? . heres a comparison of ts...,26
579477,2020-12-31,21:19:35,how many of you rich tesla people are waiting ...,20
579484,2020-12-31,21:11:23,"i have 10 , 000 to put into a tesla call",11


In [37]:
output_file_path = 'processed_reddits.csv'

# Save the DataFrame to a CSV file
tesla_df.to_csv(output_file_path, index=False)

print(f"DataFrame has been saved to {output_file_path}")

DataFrame has been saved to processed_reddits.csv
