In [2]:
import pandas as pd
import time
import datetime
import csv

In [3]:
reddit_file_path = 'r_wallstreetbets_posts.csv'

In [4]:
df = pd.read_csv(reddit_file_path)
columns_to_keep = ['title', 'created_utc'] 

df = df[columns_to_keep]
df['created_utc'] = pd.to_datetime(df['created_utc'],unit='s')


df['date'] = df['created_utc'].dt.date
df['time'] = df['created_utc'].dt.time
df = df.drop(columns=['created_utc'])
print(df.head())


  df = pd.read_csv(reddit_file_path)


                                               title        date      time
0                          Whats going on with PLTR?  2021-02-16  09:53:12
1  Need explanations on Level 2 data for GME, why...  2021-02-16  09:53:09
2       XRT is being used as a laundry short machine  2021-02-16  09:52:46
3                                          Airlines?  2021-02-16  09:52:45
4                                         Buy TRXC ðŸš€  2021-02-16  09:52:44


In [5]:
search_strings = ['tesla', 'TSLA', 'Tesla']

# Create a filter mask
mask = df['title'].str.contains('|'.join(search_strings), case=False, na=False)

# Filter the DataFrame
tesla_df = df[mask]


In [6]:
print(tesla_df['date'].min())
print(tesla_df['date'].max())

2013-05-09
2021-02-16


In [7]:
tesla_df.head()

Unnamed: 0,title,date,time
382,Trendies and Teslaâ€™s are gone. Right as we wer...,2021-02-16,05:49:44
553,If you missed out on Mara and riot last year t...,2021-02-16,04:30:15
588,Tesla Should Go Bankrupt,2021-02-16,04:17:16
713,~$1mil loss Tesla pin risk over the weekend 88...,2021-02-16,03:28:43
723,Fisker FSR - High Level DD - Primed to be Tesl...,2021-02-16,03:23:51


In [8]:
start_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2021-02-15')

# Filter the DataFrame
tesla_df = tesla_df[(tesla_df['date'] >= start_date) & (tesla_df['date'] <= end_date)]

# Display the first few rows of the filtered DataFrame
print(tesla_df.head())

                                                  title        date      time
1460                Can Nio beat Tesla in a few weeks??  2021-02-15  22:22:58
1467  A Huge Fund Bought Tesla, Apple, and Microsoft...  2021-02-15  22:20:02
1539                   $TRCH doing Business with $TSLA?  2021-02-15  22:00:56
1604                  $TSLA Trading Short via ShortAlgo  2021-02-15  21:32:43
1669  $FNMA and $FMCC going to explode soon. Compare...  2021-02-15  21:10:53


  result = libops.scalar_compare(x.ravel(), y, op)


In [9]:
def text_preprocessing(df,col_name):
    #remove URL
    df['processed'] = df[col_name].str.replace(r'http(\S)+', r'')
    df['processed'] = df['processed'].str.replace(r'http ...', r'')
    df['processed'] = df['processed'].str.replace(r'http', r'')
    df[df['processed'].str.contains(r'http')]
   # remove RT, @
    df['processed'] = df['processed'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
    df[df['processed'].str.contains(r'RT[ ]?@')]
    df['processed'] = df['processed'].str.replace(r'@[\S]+',r'')
    #remove non-ascii words and characters
    df['processed'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df['processed']]
    df['processed'] = df['processed'].str.replace(r'_[\S]?',r'')
    #remove &, < and >
    df['processed'] = df['processed'].str.replace(r'&amp;?',r'and')
    df['processed'] = df['processed'].str.replace(r'&lt;',r'<')
    df['processed'] = df['processed'].str.replace(r'&gt;',r'>')
    # remove extra space
    df['processed'] = df['processed'].str.replace(r'[ ]{2, }',r' ')
    # insert space between punctuation marks
    df['processed'] = df['processed'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
    df['processed'] = df['processed'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')
    # lower case and strip white spaces at both ends
    df['processed'] = df['processed'].str.lower()
    df['processed'] = df['processed'].str.strip()

    df['word_count'] = [len(text.split(' ')) for text in df['processed']]
    df['word_count'].value_counts()
    df = df[df['word_count']>3]
    df = df.drop_duplicates(subset=['processed'])

    return df

In [10]:
tesla_df = text_preprocessing(tesla_df,'title')

  df['processed'] = df[col_name].str.replace(r'http(\S)+', r'')
  df['processed'] = df['processed'].str.replace(r'http ...', r'')
  df['processed'] = df['processed'].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
  df['processed'] = df['processed'].str.replace(r'@[\S]+',r'')
  df['processed'] = df['processed'].str.replace(r'_[\S]?',r'')
  df['processed'] = df['processed'].str.replace(r'&amp;?',r'and')
  df['processed'] = df['processed'].str.replace(r'[ ]{2, }',r' ')
  df['processed'] = df['processed'].str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
  df['processed'] = df['processed'].str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')


In [11]:

tesla_df = tesla_df.drop(columns='title')

In [12]:
tesla_df.head()

Unnamed: 0,date,time,processed,word_count
1460,2021-02-15,22:22:58,can nio beat tesla in a few weeks ??,9
1467,2021-02-15,22:20:02,"a huge fund bought tesla , apple , and microso...",17
1539,2021-02-15,22:00:56,$ trch doing business with $ tsla ?,8
1604,2021-02-15,21:32:43,$ tsla trading short via shortalgo,6
1669,2021-02-15,21:10:53,$ fnma and $ fmcc going to explode soon . comp...,23


In [13]:
output_file_path = 'processed_reddits.csv'

# Save the DataFrame to a CSV file
tesla_df.to_csv(output_file_path, index=False)

print(f"DataFrame has been saved to {output_file_path}")

DataFrame has been saved to processed_reddits.csv
