#### Load modules

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#### Load CSV data for each subreddit

In [4]:
# Read one frame per snapshot suffix and concatenate once per subreddit.
# Calling pd.concat inside the loop re-copies every accumulated row on each
# pass and starts from an empty placeholder frame; collecting the pieces in a
# list first avoids both.
wine_frames = []
beer_frames = []
for date in ['all']:
    wine_frames.append(pd.read_csv(f'../data/subreddit_wine_data-{date}.csv', encoding='utf-8-sig'))
    beer_frames.append(pd.read_csv(f'../data/subreddit_beer_data-{date}.csv', encoding='utf-8-sig'))

# Row labels are kept exactly as read from each file (no ignore_index here).
wine_df = pd.concat(wine_frames)
beer_df = pd.concat(beer_frames)

In [5]:
# show (rows, columns) of the wine and beer frames right after loading
wine_df.shape, beer_df.shape

((4884, 7), (4903, 7))

#### Combine both DataFrames

In [7]:
# tag every row with its source subreddit (adds a column to each source frame)
wine_df['subreddit'] = 'wine'
beer_df['subreddit'] = 'beer'

# stack both frames; ignore_index=True gives the result a fresh 0..n-1 index
# so row labels coming from the two source frames cannot collide
combine_df = pd.concat([wine_df, beer_df], ignore_index=True)

# join title and body with a single space into title_body
# NOTE: if either title or body is missing (NaN) the result is NaN as well;
# such rows are removed by the later dropna() cell
combine_df['title_body'] = combine_df['title'] + ' ' + combine_df['body']

In [8]:
# keep only the first row for each post id; rebinding the result instead of
# using inplace=True keeps the step explicit and chainable
combine_df = combine_df.drop_duplicates(subset=['id'])

In [9]:
# drop every row that has a null in any column; the result is rebound rather
# than mutated with inplace=True
combine_df = combine_df.dropna()

In [10]:
# count the rows whose title_body is still missing
combine_df['title_body'].isna().sum()

0

In [11]:
# preview the first rows of the combined frame (head() defaults to 5 rows)
combine_df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,subreddit,title_body
0,[Megathread] How much is my wine worth? Is it ...,98,17j7oej,https://www.reddit.com/r/wine/comments/17j7oej...,743,2023-10-30 00:18:37,We're expanding the scope of the megathread a ...,wine,[Megathread] How much is my wine worth? Is it ...
1,Free Talk Friday,1,1gmbv5t,https://www.reddit.com/r/wine/comments/1gmbv5t...,16,2024-11-08 13:00:27,"Bottle porn without notes, random musings, off...",wine,"Free Talk Friday Bottle porn without notes, ra..."
2,Started Journey to Master,219,1gogepp,https://i.redd.it/6gdvjahxb60e1.jpeg,21,2024-11-11 08:19:05,I have great study material for anyone interes...,wine,Started Journey to Master I have great study m...
4,NV Pierre Peters,41,1goebub,https://www.reddit.com/gallery/1goebub,7,2024-11-11 06:36:49,Howdy Winos! Anyone have any idea how old this...,wine,NV Pierre Peters Howdy Winos! Anyone have any ...
5,Vouvray Chenin Blanc,16,1goj0bf,https://www.reddit.com/r/wine/comments/1goj0bf...,13,2024-11-11 10:37:00,"I mostly drink reds, but recently started tryi...",wine,"Vouvray Chenin Blanc I mostly drink reds, but ..."


#### Export CSV data

In [13]:
# show (rows, columns) of the combined frame before it is exported
combine_df.shape

(2009, 9)

In [14]:
# write data to CSV file
# columns written to the combined CSV, in output order
export_columns = ['id', 'url', 'comms_num', 'created', 'subreddit', 'title_body']

# save the selected columns without the DataFrame index
combine_df[export_columns].to_csv('../data/subreddit_combine_title_body.csv', index=False)