In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Imports for NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the beer-review dataset into a DataFrame; compression="bz2" tells pandas
# to decompress the file with bz2 before parsing it as CSV.
# NOTE(review): the ".tar.bz2" name suggests a tar archive inside the bz2 layer —
# confirm the parsed header stays clean on the pandas version in use.
df = pd.read_csv("BeerDataScienceProject.tar.bz2", compression="bz2")

In [3]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

beer_ABV              float64
beer_beerId             int64
beer_brewerId           int64
beer_name              object
beer_style             object
review_appearance     float64
review_palette        float64
review_overall        float64
review_taste          float64
review_profileName     object
review_aroma          float64
review_text            object
review_time             int64
dtype: object

In [4]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528870 entries, 0 to 528869
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   beer_ABV            508590 non-null  float64
 1   beer_beerId         528870 non-null  int64  
 2   beer_brewerId       528870 non-null  int64  
 3   beer_name           528870 non-null  object 
 4   beer_style          528870 non-null  object 
 5   review_appearance   528870 non-null  float64
 6   review_palette      528870 non-null  float64
 7   review_overall      528870 non-null  float64
 8   review_taste        528870 non-null  float64
 9   review_profileName  528755 non-null  object 
 10  review_aroma        528870 non-null  float64
 11  review_text         528751 non-null  object 
 12  review_time         528870 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 52.5+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,beer_ABV,beer_beerId,beer_brewerId,review_appearance,review_palette,review_overall,review_taste,review_aroma,review_time
count,508590.0,528870.0,528870.0,528870.0,528870.0,528870.0,528870.0,528870.0,528870.0
mean,7.017442,22098.466016,2598.423429,3.864522,3.758926,3.833197,3.765993,3.81735,1224885000.0
std,2.20446,22158.284352,5281.80535,0.60401,0.685335,0.709962,0.669018,0.718903,76056000.0
min,0.01,3.0,1.0,0.0,1.0,0.0,1.0,1.0,884390400.0
25%,5.3,1745.0,132.0,3.5,3.5,3.5,3.5,3.5,1174613000.0
50%,6.5,14368.0,394.0,4.0,4.0,4.0,4.0,4.0,1240366000.0
75%,8.5,40528.0,1475.0,4.0,4.0,4.5,4.0,4.5,1288560000.0
max,57.7,77310.0,27980.0,5.0,5.0,5.0,5.0,5.0,1326277000.0


In [6]:
# Count missing values per column before any imputation.
print(df.isnull().sum())

beer_ABV              20280
beer_beerId               0
beer_brewerId             0
beer_name                 0
beer_style                0
review_appearance         0
review_palette            0
review_overall            0
review_taste              0
review_profileName      115
review_aroma              0
review_text             119
review_time               0
dtype: int64


### Impute missing ABV with the per-style median, which is more robust to outliers than the mean

In [7]:
# Median ABV for each beer style, returned as a Series indexed by beer_style.
# pandas' median skips NaN by default, so only reviews with a recorded ABV contribute.
median_abvs = df.groupby('beer_style')['beer_ABV'].median()

In [8]:
def impute_abv(row):
    """Return the row's ABV, falling back to its style's median when missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'beer_ABV' and 'beer_style'.

    Returns
    -------
    The row's own 'beer_ABV' when it is not NaN; otherwise the entry of the
    notebook-level ``median_abvs`` lookup for the row's 'beer_style'.
    """
    abv = row['beer_ABV']
    # Recorded values pass through untouched; only gaps take the style median.
    return median_abvs[row['beer_style']] if pd.isna(abv) else abv



In [9]:
# Fill missing ABV values with the median ABV of the row's beer style.
# Mapping every style to its median and handing the result to fillna performs
# the lookup column-wise, instead of making one Python-level function call per
# row as DataFrame.apply(axis=1) does. Rows with a recorded ABV are unchanged.
df['beer_ABV'] = df['beer_ABV'].fillna(df['beer_style'].map(median_abvs))

In [10]:
# Mean ABV after imputation, as a quick sanity check on the filled column.
df['beer_ABV'].mean()

6.975648382400212

In [11]:
# Re-count missing values per column to confirm beer_ABV has no gaps left.
print(df.isnull().sum())

beer_ABV                0
beer_beerId             0
beer_brewerId           0
beer_name               0
beer_style              0
review_appearance       0
review_palette          0
review_overall          0
review_taste            0
review_profileName    115
review_aroma            0
review_text           119
review_time             0
dtype: int64


### 'review_profileName' and 'review_text' have null values. Impute them with the placeholder 'Unknown', which will not affect data integrity

In [12]:
# Replace missing reviewer names in 'review_profileName' with the placeholder 'Unknown'
df['review_profileName'] = df['review_profileName'].fillna('Unknown')

In [13]:
# Replace missing values in 'review_text' with the placeholder 'Unknown'
# NOTE(review): the placeholder is ordinary text, so it will pass through the
# text-cleaning steps like any other review — confirm that is acceptable for
# the downstream NLP.
df['review_text'] = df['review_text'].fillna('Unknown')

In [14]:
# Preview the review text column after filling missing values.
df['review_text']

0         A lot of foam. But a lot. In the smell some ba...
1         Dark red color, light beige foam, average. In ...
2         Almost totally black. Beige foam, quite compac...
3         Golden yellow color. White, compact foam, quit...
4         According to the website, the style for the Ca...
                                ...                        
528865    A-pours a reddish amber that looks very nice,l...
528866    I don't really have anything special to say ab...
528867    Had this on tap at Vreny's Beirgarten A - Came...
528868    Purchased at Market Cross Pub in carlisle, PA....
528869    I ordered a mug of this beer at Schnitzelhaus,...
Name: review_text, Length: 528870, dtype: object

In [15]:
# Show the first two rows of the frame.
df.head(2)

Unnamed: 0,beer_ABV,beer_beerId,beer_brewerId,beer_name,beer_style,review_appearance,review_palette,review_overall,review_taste,review_profileName,review_aroma,review_text,review_time
0,5.0,47986,10325,Sausa Weizen,Hefeweizen,2.5,2.0,1.5,1.5,stcules,1.5,A lot of foam. But a lot. In the smell some ba...,1234817823
1,6.2,48213,10325,Red Moon,English Strong Ale,3.0,2.5,3.0,3.0,stcules,3.0,"Dark red color, light beige foam, average. In ...",1235915097


### Clean and preprocess the text data for NLP

In [16]:
# Initialize necessary NLP tools
# Download the NLTK resources used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')
# Keep the English stop words in a set; the token filter below tests membership per word.
stop_words = set(stopwords.words('english'))
stop_words.add('beer')  # Also treat "beer" as a stop word (presumably too common in reviews to be informative)
# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tsher\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tsher\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
# Data Cleaning function
def clean_text(text):
    """Normalise raw review text ahead of tokenisation.

    Every run of non-word characters (punctuation, symbols, whitespace) is
    collapsed into a single space, leading and trailing whitespace is removed,
    and the result is lower-cased. Letters, digits and underscores are kept.

    Parameters
    ----------
    text : object
        Raw text; non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        Lower-case text whose words are separated by single spaces, with no
        space at either end.
    """
    # `\W+` matches whitespace as well as punctuation, so one substitution both
    # replaces non-word characters and squeezes repeated spaces.
    text = re.sub(r'\W+', ' ', str(text))
    # Text that starts or ends with punctuation would otherwise keep a stray
    # space at that end.
    return text.strip().lower()

In [18]:
# Add a 'cleaned_review_text' column by running clean_text on every review.
df['cleaned_review_text'] = df['review_text'].apply(clean_text)

In [19]:
# Tokenization, Stop Words Removal, and Lemmatization
def preprocess_text(text):
    """Tokenise cleaned text, drop stop words and lemmatise what remains.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.

    Returns
    -------
    str
        The surviving tokens, each passed through ``lemmatizer.lemmatize``,
        joined by single spaces. Tokens found in the notebook-level
        ``stop_words`` set are removed before lemmatisation.
    """
    kept = (tok for tok in text.split() if tok not in stop_words)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

In [20]:
# Add a 'processed_review_text' column by running preprocess_text on the cleaned text.
df['processed_review_text'] = df['cleaned_review_text'].apply(preprocess_text)

In [21]:
# Save the first 1000 rows to 'Partial2.csv' as a small sample of the processed data.
# NOTE(review): to_csv writes the index as an unnamed first column by default —
# confirm readers of this file expect it.
df[:1000].to_csv('Partial2.csv')

In [22]:
# Save the full processed frame to 'cleaned_beer.csv'.
# NOTE(review): to_csv writes the index as an unnamed first column by default —
# confirm readers of this file expect it.
df.to_csv('cleaned_beer.csv')

In [23]:
# List the columns to confirm the two derived text columns were added.
df.columns

Index(['beer_ABV', 'beer_beerId', 'beer_brewerId', 'beer_name', 'beer_style',
       'review_appearance', 'review_palette', 'review_overall', 'review_taste',
       'review_profileName', 'review_aroma', 'review_text', 'review_time',
       'cleaned_review_text', 'processed_review_text'],
      dtype='object')