#### Import modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#### Load subreddit data

In [4]:
# load data from CSV
topic_df = pd.read_csv('../data/subreddit_combine_title_body.csv')

In [5]:
# display sample data
topic_df.head(2)

Unnamed: 0,id,url,comms_num,created,subreddit,title_body
0,17j7oej,https://www.reddit.com/r/wine/comments/17j7oej...,743,2023-10-30 00:18:37,wine,[Megathread] How much is my wine worth? Is it ...
1,1gmbv5t,https://www.reddit.com/r/wine/comments/1gmbv5t...,16,2024-11-08 13:00:27,wine,"Free Talk Friday Bottle porn without notes, ra..."


#### Load class mapping data

In [7]:
# load data from CSV
class_df = pd.read_csv('../data/wine_beer_class.csv')

# convert words to lowercase
class_df['variety'] = class_df['variety'].str.lower()

In [8]:
# display sample rows
class_df.head(5)

Unnamed: 0,type,characteristic,variety
0,wine,taste,fruity
1,wine,taste,earthy
2,wine,taste,floral
3,wine,taste,spicy
4,wine,taste,herbal


In [9]:
# display dataframe shape
class_df.shape

(110, 3)

In [10]:
# get word list for vertorization
variety = pd.Series(class_df['variety'].unique()).str.lower()
class_words = {w: i for i, w in enumerate(variety)}

In [11]:
# vertorization only word from the `variety`
cvec = CountVectorizer(vocabulary=class_words, stop_words='english', ngram_range=(1, 2))
cvec_word = cvec.fit_transform(topic_df['title_body'])

In [12]:
# normalize words
lemma_token = WordNetLemmatizer()

# get words from vectorizer
words = [lemma_token.lemmatize(w) for w in cvec.get_feature_names_out()]

# generate word class dataframe
word_df = pd.DataFrame(cvec_word.toarray(), columns=words)

# copy id and subreddit from newly create word
word_df['id'] = topic_df['id']
word_df['subreddit'] = topic_df['subreddit']

In [13]:
# convert number to boolean for count only once per topic
beer_words_df = pd.DataFrame(data={'type': 'beer', 'topic_count': word_df[word_df['subreddit']=='beer'].astype(bool).sum()})
wine_words_df = pd.DataFrame(data={'type': 'wine', 'topic_count': word_df[word_df['subreddit']=='wine'].astype(bool).sum()})

# combine beer and wine words DataFrame using `word` as key
topic_words_df = pd.concat([beer_words_df, wine_words_df])

# drop unused columns
topic_words_df.drop(['id','subreddit'], inplace=True)

# copy word from the index to a new column
topic_words_df['word'] = topic_words_df.index

# reset index
topic_words_df.reset_index(drop=True, inplace=True)

In [14]:
topic_words_df.head()

Unnamed: 0,type,topic_count,word
0,beer,11,fruity
1,beer,2,earthy
2,beer,3,floral
3,beer,2,spicy
4,beer,2,herbal


In [15]:
# mapping word class to topic_word_df
def map_characteristic(word):
    topic_words_df.loc[
      (topic_words_df['type']==word['type']) \
    & (topic_words_df['word']==word['variety']), 'characteristic'] = word['characteristic']

class_df.apply(map_characteristic, axis=1)
topic_words_df.dropna(inplace=True)

In [16]:
# get characteristic for beer
beer_class_df = topic_words_df[topic_words_df['type']=='beer'] \
    .groupby(['characteristic','word']).sum() \
    .reset_index() \
    .sort_values(by=['characteristic','topic_count'], ascending=False)

beer_class_df.head()

Unnamed: 0,characteristic,word,type,topic_count
20,taste,sweet,beer,34
18,taste,bitter,beer,26
19,taste,sour,beer,20
16,raw material,hop,beer,16
17,raw material,malt,beer,15


In [17]:
# get characteristic for wine
wine_class_df = topic_words_df[topic_words_df['type']=='wine'] \
    .groupby(['characteristic','word']).sum() \
    .reset_index() \
    .sort_values(by=['characteristic','topic_count'], ascending=False)

wine_class_df.head()

Unnamed: 0,characteristic,word,type,topic_count
73,wine variety,pinot noir,wine,55
59,wine variety,chardonnay,wine,43
67,wine variety,merlot,wine,33
76,wine variety,riesling,wine,28
79,wine variety,syrah,wine,22


In [18]:
all_class_df = pd.concat([beer_class_df, wine_class_df]).reset_index(drop=True)

In [19]:
# convert compound score to binary value
def get_sum_compound(sentiment):
    if sentiment['compound'] <= -0.05:
        return 'negative'
    else:
        return 'not_negative'

In [20]:
# calculate sentiment for each topic
sia = SentimentIntensityAnalyzer()
topic_class_df = topic_df[['id','subreddit','title_body']].copy()
topic_class_df['sentiment'] = topic_class_df['title_body'].apply(lambda d: sia.polarity_scores(d))

# separate sentiment value to multiple columns
topic_class_df['sent_neg'] = topic_class_df['sentiment'].apply(lambda s: s['neg'])
topic_class_df['sent_neu'] = topic_class_df['sentiment'].apply(lambda s: s['neu'])
topic_class_df['sent_pos'] = topic_class_df['sentiment'].apply(lambda s: s['pos'])
topic_class_df['sent_comp'] = topic_class_df['sentiment'].apply(lambda s: s['compound'])
topic_class_df['sum_comp'] = topic_class_df['sentiment'].apply(get_sum_compound)
topic_class_df.set_index('id', inplace=True)

# concat sentiment with word verterization dataframe
topic_class_df = pd.concat([topic_class_df, word_df.set_index('id').drop(columns='subreddit')], axis=1)
topic_class_df.dropna(inplace=True)
topic_class_df.drop(columns=['sentiment'], inplace=True)
topic_class_df.to_csv('../data/topic_word_class.csv')

In [21]:
# show sample data
topic_class_df[['subreddit','title_body','sent_neg','sent_neu','sent_pos','sent_comp','sum_comp']].head()

Unnamed: 0_level_0,subreddit,title_body,sent_neg,sent_neu,sent_pos,sent_comp,sum_comp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
17j7oej,wine,[Megathread] How much is my wine worth? Is it ...,0.017,0.948,0.035,0.3527,not_negative
1gmbv5t,wine,"Free Talk Friday Bottle porn without notes, ra...",0.0,0.769,0.231,0.5106,not_negative
1gogepp,wine,Started Journey to Master I have great study m...,0.117,0.607,0.275,0.5994,not_negative
1goebub,wine,NV Pierre Peters Howdy Winos! Anyone have any ...,0.025,0.795,0.18,0.9197,not_negative
1goj0bf,wine,"Vouvray Chenin Blanc I mostly drink reds, but ...",0.0,0.739,0.261,0.988,not_negative


In [22]:
# update summarize binary sentiment for each word class
for i, word in all_class_df.iterrows():
    a = topic_class_df[(topic_class_df['subreddit']==word['type']) & topic_class_df[word['word']]]['sum_comp'].value_counts(normalize=True)
    all_class_df.loc[i, 'negative'] = a['negative'] if 'negative' in a else 0
    all_class_df.loc[i, 'not_negative'] = a['not_negative'] if 'not_negative' in a else 0

In [23]:
# show combine result
all_class_df.sort_values('characteristic').head()

Unnamed: 0,characteristic,word,type,topic_count,negative,not_negative
20,appearance,clarity,beer,0,0.0,0.0
18,appearance,foam,beer,9,0.4,0.6
19,appearance,color,beer,3,0.0,1.0
17,brand,stella,beer,8,0.5,0.5
16,brand,heineken,beer,12,0.181818,0.818182


In [24]:
# save result to CSV
all_class_df.dropna(inplace=True)
all_class_df.to_csv('../data/all_type_class.csv', index=False)