# 3. Data Processing & EDA

In [1]:
# import packages
import pandas as pd
import re
import string
import nltk

from sklearn.feature_extraction.text import CountVectorizer

# English stop-word list from the NLTK corpus, plus a Porter stemmer and a
# WordNet lemmatizer instance (none of them is used in the cells shown below)
stopwords = nltk.corpus.stopwords.words("english")
ps, wn = nltk.PorterStemmer(), nltk.WordNetLemmatizer()

In [2]:
# load the two cleaned subreddit exports from the data directory
crypto_path = "../data/crypto_clean.csv"
stocks_path = "../data/stocks_clean.csv"

crypto = pd.read_csv(crypto_path)
stocks = pd.read_csv(stocks_path)

In [3]:
# append crypto dataset to stocks dataset (stocks rows first, crypto rows after)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Each frame keeps its own row labels, so the
# combined index contains duplicate labels.
data = pd.concat([stocks, crypto])

In [4]:
# list the dtype of every column in the combined frame
data.dtypes

subreddit      object
selftext       object
title          object
created_utc     int64
dtype: object

In [5]:
# preview the first rows of the combined frame
data.head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,stocks,I am earning very little at the moment but I w...,Advise on Long Term Stock?,1626851004
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",Dad told me to sell on Monday when the market ...,1626847423
2,stocks,Retail owns the companies so it could happen i...,Merger/accusation question about a game co. An...,1626846017
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",Best Software to Track Trades and Create Reports,1626845812
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,"After Driving My Friends Tesla, Here Are My Th...",1626840162


In [6]:
# preview the last rows of the combined frame
data.tail()

Unnamed: 0,subreddit,selftext,title,created_utc
1695,CryptoCurrency,I’ve started seeing posts on here acknowledgin...,"“Bull Markets Can Make You Money, Bear Markets...",1626623199
1696,CryptoCurrency,Let me get started.\n\nIt was after the Snowde...,What made you get into crypto?,1626622781
1697,CryptoCurrency,I have been thinking about this for sometime a...,How to introduce crypto to the people who have...,1626622046
1698,CryptoCurrency,I'm curious as to how mining and the price of ...,Mining crypto,1626621863
1699,CryptoCurrency,"Hey guys, \n\nSo the post a few days ago about...",Buying Crypto with Reward Points or Gift Cards?,1626621433


In [7]:
# reset index for appended dataset
# reset_index returns a new frame rather than modifying `data`, so the result
# must be assigned back; otherwise the duplicate row labels inherited from the
# two source files stay in place.
data = data.reset_index(drop=True)

# display the re-indexed frame
data

Unnamed: 0,subreddit,selftext,title,created_utc
0,stocks,I am earning very little at the moment but I w...,Advise on Long Term Stock?,1626851004
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",Dad told me to sell on Monday when the market ...,1626847423
2,stocks,Retail owns the companies so it could happen i...,Merger/accusation question about a game co. An...,1626846017
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",Best Software to Track Trades and Create Reports,1626845812
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,"After Driving My Friends Tesla, Here Are My Th...",1626840162
...,...,...,...,...
3395,CryptoCurrency,I’ve started seeing posts on here acknowledgin...,"“Bull Markets Can Make You Money, Bear Markets...",1626623199
3396,CryptoCurrency,Let me get started.\n\nIt was after the Snowde...,What made you get into crypto?,1626622781
3397,CryptoCurrency,I have been thinking about this for sometime a...,How to introduce crypto to the people who have...,1626622046
3398,CryptoCurrency,I'm curious as to how mining and the price of ...,Mining crypto,1626621863


In [8]:
# drop 'created_utc' column
# Rebind instead of mutating with inplace=True, and tolerate the column already
# being absent so the cell can be re-run without raising KeyError.
data = data.drop(columns=['created_utc'], errors='ignore')

data.head()

Unnamed: 0,subreddit,selftext,title
0,stocks,I am earning very little at the moment but I w...,Advise on Long Term Stock?
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",Dad told me to sell on Monday when the market ...
2,stocks,Retail owns the companies so it could happen i...,Merger/accusation question about a game co. An...
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",Best Software to Track Trades and Create Reports
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,"After Driving My Friends Tesla, Here Are My Th..."


In [9]:
# concatenate "title" and "selftext" columns
# A missing value on either side would turn the whole concatenation into NaN
# and discard the other field's text (read_csv loads empty fields as NaN), so
# missing entries are treated as empty strings.
data["title+selftext"] = data["title"].fillna("") + " " + data["selftext"].fillna("")

In [10]:
# confirm the "title+selftext" column is present
data.head()

Unnamed: 0,subreddit,selftext,title,title+selftext
0,stocks,I am earning very little at the moment but I w...,Advise on Long Term Stock?,Advise on Long Term Stock? I am earning very l...
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",Dad told me to sell on Monday when the market ...,Dad told me to sell on Monday when the market ...
2,stocks,Retail owns the companies so it could happen i...,Merger/accusation question about a game co. An...,Merger/accusation question about a game co. An...
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",Best Software to Track Trades and Create Reports,Best Software to Track Trades and Create Repor...
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,"After Driving My Friends Tesla, Here Are My Th...","After Driving My Friends Tesla, Here Are My Th..."


In [11]:
# keep just the label column and the combined text column
keep_cols = ["subreddit", "title+selftext"]
data = data[keep_cols]

In [12]:
# confirm only "subreddit" and "title+selftext" remain
data.head()

Unnamed: 0,subreddit,title+selftext
0,stocks,Advise on Long Term Stock? I am earning very l...
1,stocks,Dad told me to sell on Monday when the market ...
2,stocks,Merger/accusation question about a game co. An...
3,stocks,Best Software to Track Trades and Create Repor...
4,stocks,"After Driving My Friends Tesla, Here Are My Th..."


In [13]:
# write the processed frame to disk, leaving the row index out of the file
processed_path = '../data/processed_data.csv'
data.to_csv(processed_path, index=False)