In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#### Load sub Reddit data

In [3]:
topic_df = pd.read_csv('../data/subreddit_combine_title_body.csv')

In [4]:
topic_df.head(2)

Unnamed: 0,id,url,comms_num,created,subreddit,title_body
0,17j7oej,https://www.reddit.com/r/wine/comments/17j7oej...,743,2023-10-30 00:18:37,wine,[Megathread] How much is my wine worth? Is it ...
1,1gmbv5t,https://www.reddit.com/r/wine/comments/1gmbv5t...,16,2024-11-08 13:00:27,wine,"Free Talk Friday Bottle porn without notes, ra..."


In [5]:
topic_df.shape

(2009, 6)

#### Load Class Mapping Data

In [7]:
class_df = pd.read_csv('../data/wine_beer_merged.csv')

In [8]:
class_df.head(5)

Unnamed: 0,type,characteristic,variety,word_count
0,wine,taste,fruity,18
1,wine,taste,earthy,26
2,wine,taste,floral,20
3,wine,taste,spicy,16
4,wine,taste,herbal,3


In [9]:
class_df.shape

(111, 4)

In [10]:
# normalize words
lemma_token = WordNetLemmatizer()

def lemmatize(words):
    return ' '.join([lemma_token.lemmatize(w) for w in words.split()])

topic_df['title_body_norm'] = topic_df['title_body'].apply(lemmatize)

In [11]:
# get word list for vertorization
variety = pd.Series(class_df['variety'].unique()).str.lower()
class_words = {w: i for i, w in enumerate(variety)}

In [12]:
# vectorize only word from the `variety`
cvec = CountVectorizer(vocabulary=class_words, stop_words='english')
cvec_word = cvec.fit_transform(topic_df['title_body_norm'])

In [13]:
# get words from vectorizer
words = cvec.get_feature_names_out()
word_df = pd.DataFrame(cvec_word.toarray(), columns=words)
word_df['id'] = topic_df['id']
word_df['subreddit'] = topic_df['subreddit']

In [14]:
# convert number to boolean for count only once per topic
beer_words_df = pd.DataFrame(data={'type': 'beer', 'topic_count': word_df[word_df['subreddit']=='beer'].astype(bool).sum()})
wine_words_df = pd.DataFrame(data={'type': 'wine', 'topic_count': word_df[word_df['subreddit']=='wine'].astype(bool).sum()})

# combine beer and wine words DataFrame using `word` as key
topic_words_df = pd.concat([beer_words_df, wine_words_df])

# drop unused columns
topic_words_df.drop(['id','subreddit'], inplace=True)

# copy word from the index to a new column
topic_words_df['word'] = topic_words_df.index

# reset index
topic_words_df.reset_index(drop=True, inplace=True)

In [15]:
topic_words_df.head()

Unnamed: 0,type,topic_count,word
0,beer,11,fruity
1,beer,2,earthy
2,beer,3,floral
3,beer,2,spicy
4,beer,2,herbal


In [16]:
def map_characteristic(word):
    topic_words_df.loc[
      (topic_words_df['type']==word['type']) \
    & (topic_words_df['word']==word['variety']), 'characteristic'] = word['characteristic']

In [17]:
# mapping word class to topic_word_df
class_df.apply(map_characteristic, axis=1)
topic_words_df.dropna(inplace=True)

In [18]:
# get characteristic for beer
beer_class_df = topic_words_df[topic_words_df['type']=='beer'] \
    .groupby(['characteristic','word']).sum() \
    .reset_index() \
    .sort_values(by=['characteristic','topic_count'], ascending=False)

beer_class_df

Unnamed: 0,characteristic,word,type,topic_count
15,taste,sweet,beer,34
13,taste,bitter,beer,27
14,taste,sour,beer,25
11,raw material,hop,beer,26
12,raw material,malt,beer,17
9,mouthfeel,light,beer,60
10,mouthfeel,medium,beer,3
8,mouthfeel,full,beer,0
7,character,floral,beer,3
6,character,earthy,beer,2


In [93]:
# get characteristic for wine
wine_class_df = topic_words_df[topic_words_df['type']=='wine'] \
    .groupby(['characteristic','word']).sum() \
    .reset_index() \
    .sort_values(by=['characteristic','topic_count'], ascending=False)

wine_class_df

Unnamed: 0,characteristic,word,type,topic_count
20,taste,cherry,wine,79
26,taste,dry,wine,66
42,taste,plum,wine,51
16,taste,balanced,wine,49
49,taste,vanilla,wine,38
27,taste,earthy,wine,35
17,taste,blackberry,wine,30
29,taste,fruity,wine,25
46,taste,smooth,wine,25
43,taste,rich,wine,24


In [119]:
all_class_df = pd.concat([beer_class_df, wine_class_df]).reset_index(drop=True)
all_class_df.to_csv('../data/all_type_class.csv')

In [121]:
all_class_df.head()

Unnamed: 0,characteristic,word,type,topic_count
0,taste,sweet,beer,34
1,taste,bitter,beer,27
2,taste,sour,beer,25
3,raw material,hop,beer,26
4,raw material,malt,beer,17


In [21]:
def get_sum_compound(sentiment):
    if sentiment['compound'] <= -0.05:
        return 'negative'
    else:
        return 'not_negative'

In [89]:
# calculate sentiment for each topic
sia = SentimentIntensityAnalyzer()
topic_class_df = topic_df[['id','subreddit','title_body']].copy()
topic_class_df['sentiment'] = topic_class_df['title_body'].apply(lambda d: sia.polarity_scores(d))
# separate sentiment value to multiple columns
topic_class_df['sent_neg'] = topic_class_df['sentiment'].apply(lambda s: s['neg'])
topic_class_df['sent_neu'] = topic_class_df['sentiment'].apply(lambda s: s['neu'])
topic_class_df['sent_pos'] = topic_class_df['sentiment'].apply(lambda s: s['pos'])
topic_class_df['sent_comp'] = topic_class_df['sentiment'].apply(lambda s: s['compound'])
topic_class_df['sum_comp'] = topic_class_df['sentiment'].apply(get_sum_compound)
topic_class_df.set_index('id', inplace=True)

topic_class_df = pd.concat([topic_class_df, word_df.set_index('id')], axis=1)
topic_class_df.dropna(inplace=True)
topic_class_df.drop(columns='sentiment', inplace=True)
topic_class_df.to_csv('../data/topic_class.csv')

In [91]:
topic_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2009 entries, 17j7oej to 1gudbt0
Columns: 112 entries, subreddit to subreddit
dtypes: float64(4), int64(104), object(4)
memory usage: 1.7+ MB


In [63]:
topic_class_df[['subreddit','title_body','sent_neg','sent_neu','sent_pos','sent_comp','sum_comp']].head()

Unnamed: 0_level_0,subreddit,subreddit,title_body,sent_neg,sent_neu,sent_pos,sent_comp,sum_comp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17j7oej,wine,wine,[Megathread] How much is my wine worth? Is it ...,0.017,0.948,0.035,0.3527,not_negative
1gmbv5t,wine,wine,"Free Talk Friday Bottle porn without notes, ra...",0.0,0.769,0.231,0.5106,not_negative
1gogepp,wine,wine,Started Journey to Master I have great study m...,0.117,0.607,0.275,0.5994,not_negative
1goebub,wine,wine,NV Pierre Peters Howdy Winos! Anyone have any ...,0.025,0.795,0.18,0.9197,not_negative
1goj0bf,wine,wine,"Vouvray Chenin Blanc I mostly drink reds, but ...",0.0,0.739,0.261,0.988,not_negative


In [111]:
# for charracter in class_df['characteristic'].unique():
#   for varities in class_df[class_df['characteristic']==charracter]['varities'].unique():
#        pass

In [123]:
topic_class_df

Unnamed: 0_level_0,subreddit,title_body,sent_neg,sent_neu,sent_pos,sent_comp,sum_comp,fruity,earthy,floral,...,citrusy,color,clarity,foam,corona,heineken,guinness,stella,modelo,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17j7oej,wine,[Megathread] How much is my wine worth? Is it ...,0.017,0.948,0.035,0.3527,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,wine
1gmbv5t,wine,"Free Talk Friday Bottle porn without notes, ra...",0.000,0.769,0.231,0.5106,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,wine
1gogepp,wine,Started Journey to Master I have great study m...,0.117,0.607,0.275,0.5994,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,wine
1goebub,wine,NV Pierre Peters Howdy Winos! Anyone have any ...,0.025,0.795,0.180,0.9197,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,wine
1goj0bf,wine,"Vouvray Chenin Blanc I mostly drink reds, but ...",0.000,0.739,0.261,0.9880,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,wine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1guj6yg,beer,Breweries near Branson? Hello fellow beer nuts...,0.066,0.770,0.164,0.5362,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,beer
1guo4ee,beer,Fat Ox ale alternative Fat Ox is made by Flyi...,0.025,0.808,0.166,0.9722,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,beer
1gulbms,beer,Beer recommendations! Hello! Newly 21 year old...,0.102,0.575,0.323,0.9862,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,beer
1gugpas,beer,Hershey yuengling Does anyone think / know if ...,0.000,1.000,0.000,0.0000,not_negative,0,0,0,...,0,0,0,0,0,0,0,0,0,beer
