In [1]:
#!pip install tokenizers

In [2]:
import os
import sys

# Walk up from the notebook's folder until the working directory no longer
# mentions any of the known sub-folder names, then make that directory
# importable so the `src.*` and `projects.*` imports in the next cell resolve.
# NOTE(review): this is a substring match on the whole working-directory path,
# so a parent folder whose name merely contains one of the markers
# (e.g. ~/projects/<repo>) makes the loop climb above the repository root —
# confirm the checkout location before relying on it.
while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
    os.chdir("..")
# Guarded so that re-running this cell does not keep stacking duplicate '.'
# entries on sys.path.
if '.' not in sys.path:
    sys.path.append('.')

In [None]:
import kagglehub
from dotenv import load_dotenv
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from src.custom_transformers import (
    CustomOneHotEncoder,
    CustomStandardScaler
)
import projects.proj_3_team_5.src.custom_transformers as project_transformers
from projects.proj_3_team_5.src.custom_transformers import CustomTokenizerVectorizer, ToDataFrameFromColumnTransformer



In [None]:
# Project-level environment file, given relative to the current working
# directory. Presumably it defines the *_DATA_DIR variables read with
# os.getenv in later cells — verify against the actual .env.
env_path = 'projects/proj_3_team_5/.env'
load_dotenv(dotenv_path=env_path)

In [None]:
# Fetch the latest version of the Kaggle dataset; `path` is the location
# kagglehub reports for the downloaded files and is used later as the
# fallback folder for the raw CSV.
path = kagglehub.dataset_download("ruthgn/beer-profile-and-ratings-data-set")

print(f"Path to dataset files: {path}")

In [6]:
# Locations for the raw input and the preprocessed output, taken from the
# environment; each is None when the variable is not set.
df_raw_path = os.environ.get('RAW_DATA_DIR')
df_preprocessed_path = os.environ.get('PREPROCESSED_DATA_DIR')

In [None]:
# Resolve which CSV to load.
#   * RAW_DATA_DIR unset  -> fall back to the folder kagglehub downloaded to.
#   * RAW_DATA_DIR is a directory (as its name suggests) -> look for the
#     dataset file inside it instead of handing a directory to read_csv.
#   * otherwise           -> treat the value as the CSV path itself.
RAW_CSV_NAME = "beer_profile_and_ratings.csv"  # update the filename as needed
if not df_raw_path:
    df_raw_path = os.path.join(path, RAW_CSV_NAME)
elif os.path.isdir(df_raw_path):
    df_raw_path = os.path.join(df_raw_path, RAW_CSV_NAME)

df_raw = pd.read_csv(df_raw_path)
# Column names, dtypes and non-null counts of the freshly loaded frame.
df_raw.info()

In [None]:
# Preview five random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" so the saved output stays reproducible.
df_raw.sample(5, random_state=42)

In [None]:
# Per-column count of NaN/None entries; only columns with at least one are shown.
print("\nMissing Values:")
missing = df_raw.isna().sum()
has_missing = missing > 0
print(missing.loc[has_missing].sort_values(ascending=False))

# Empty strings are not reported by isna(), so they are tallied separately.
empty_cells = (df_raw == '').sum()
print("\nEmpty String Values:")
has_empty = empty_cells > 0
print(empty_cells.loc[has_empty].sort_values(ascending=False))

In [None]:
# Count rows that repeat an earlier row in every column.
num_dupes = df_raw.duplicated().sum()
# The frame inspected here is the raw dataset (no train split exists at this
# point), so the message names it accordingly.
print(f"\nNumber of duplicate rows in raw data: {num_dupes}")

In [None]:
# Number of distinct values in each object-dtype column, listed from the
# most to the least varied.
categorical_cols = df_raw.select_dtypes(include=['object']).columns
cardinality = {}
for col in categorical_cols:
    cardinality[col] = df_raw[col].nunique()

print("Number of unique values in categorical columns:")
by_count_desc = sorted(cardinality.items(), key=lambda item: item[1], reverse=True)
for col, count in by_count_desc:
    print(f"{col}: {count}")

In [None]:
# Check if all descriptions start with "Notes:".
# na=False makes a missing description count as "does not start with
# 'Notes:'" instead of propagating NaN; a NaN-bearing object mask would make
# .all() unreliable and would break the boolean negation used below.
starts_with_notes = df_raw['Description'].str.startswith('Notes:', na=False)
all_start_with_notes = starts_with_notes.all()
print(f"All descriptions start with 'Notes:': {all_start_with_notes}")

# Show some examples if not all start with "Notes:"
if not all_start_with_notes:
    print("\nExamples of descriptions that don't start with 'Notes:':")
    # Reuse the mask computed above rather than evaluating the prefix test twice.
    non_notes = df_raw.loc[~starts_with_notes, 'Description'].head()
    print(non_notes)


In [None]:
# Names of the int64/float64 columns of the raw frame; this list is reused
# later when configuring the outlier remover and the scaler.
numeric_frame = df_raw.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_frame.columns.tolist()
print("Numeric columns:", numeric_cols, sep="\n")


In [14]:
# Two-step cleaning pipeline built from the project's own transformers.
# NOTE(review): judging by the names and arguments, the first step removes the
# listed regex patterns from 'Description' and the second drops rows that are
# outliers in the numeric columns (threshold=3) — confirm against
# projects/proj_3_team_5/src/custom_transformers.
description_cleaner = project_transformers.TextCleaner(
    column='Description',
    patterns_to_remove=['^Notes:', 'error entering this description'],
)
outlier_remover = project_transformers.CustomOutlierRemover(
    columns=numeric_cols,
    threshold=3,
)
cleaning_pipeline = make_pipeline(description_cleaner, outlier_remover)

In [None]:
# Fit the cleaning steps on the raw frame and apply them in one pass.
df_cleaned = cleaning_pipeline.fit_transform(df_raw)

# Shapes before and after, to see how many rows/columns the cleaning changed.
print(f"Original shape: {df_raw.shape}")
print(f"Processed shape: {df_cleaned.shape}")

In [None]:
# Bare expression: the notebook shows the (now fitted) cleaning pipeline as
# this cell's output.
cleaning_pipeline

In [None]:
# Tally descriptions that are either missing (NaN) or an empty string after
# cleaning; the two cases cannot overlap, so one combined mask is enough.
description = df_cleaned['Description']
is_blank = description.isna() | (description == '')
empty_descriptions = is_blank.sum()
print(f"Number of empty cells in Description: {empty_descriptions}")


In [None]:
# Preview five random cleaned rows; the fixed seed keeps the displayed rows
# identical across "Restart & Run All".
df_cleaned.sample(5, random_state=42)

In [19]:
# Destination for the cleaned data, taken from the environment; None when
# CLEANED_DATA_DIR is not set.
df_cleaned_path = os.environ.get('CLEANED_DATA_DIR')

In [None]:
# Persist the cleaned frame.
# DataFrame.to_csv(None) writes nothing — it returns the whole CSV as a
# string — so an unset path must fail loudly instead of silently skipping the
# save (and dumping the entire table into the cell output).
if not df_cleaned_path:
    raise ValueError(
        "CLEANED_DATA_DIR is not set; cannot save the cleaned data."
    )
# Make sure the destination folder exists so the write works on a fresh checkout.
cleaned_parent_dir = os.path.dirname(df_cleaned_path)
if cleaned_parent_dir:
    os.makedirs(cleaned_parent_dir, exist_ok=True)
df_cleaned.to_csv(df_cleaned_path, index=False)

In [21]:
# (step name, source column, vocabulary size) for each text column that gets
# its own tokenizer/vectorizer.
text_column_specs = [
    ('name_tok', 'Name', 300),
    ('desc_tok', 'Description', 1000),
    ('beer_tok', 'Beer Name (Full)', 500),
]

# Columns not listed above are passed through unchanged; sparse_threshold=0
# forces a dense result, and feature names are left without step prefixes.
transformers = ColumnTransformer(
    [
        (step_name, CustomTokenizerVectorizer(column=col_name, vocab_size=size), [col_name])
        for step_name, col_name, size in text_column_specs
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)


In [22]:
# Feature pipeline: column-wise text vectorisation, conversion back to a
# DataFrame, one-hot encoding of 'Style'/'Brewery', then scaling of the
# numeric columns.
# NOTE(review): ToDataFrameFromColumnTransformer receives the same
# `transformers` object that is the first step — presumably to read its output
# feature names after fitting; confirm in the project's custom_transformers.
to_frame = ToDataFrameFromColumnTransformer(transformers)
one_hot = CustomOneHotEncoder(columns=['Style', 'Brewery'])
scaler = CustomStandardScaler(columns=numeric_cols)

final_pipeline = make_pipeline(transformers, to_frame, one_hot, scaler)

In [None]:
# Fit every step of the feature pipeline on the cleaned frame and return the
# transformed result in one call.
df_preprocessed = final_pipeline.fit_transform(df_cleaned)

In [None]:
# Bare expression: the notebook shows the (now fitted) feature pipeline as
# this cell's output.
final_pipeline

In [None]:
# Shape of the pipeline output, shown as the cell output for a quick sanity check.
df_preprocessed.shape

In [None]:
# Preview five random preprocessed rows; the fixed seed keeps the displayed
# rows identical across "Restart & Run All".
df_preprocessed.sample(5, random_state=42)

In [None]:
# Persist the preprocessed frame.
# DataFrame.to_csv(None) writes nothing — it returns the whole CSV as a
# string — so an unset path must fail loudly instead of silently skipping the
# save (and dumping the entire table into the cell output).
if not df_preprocessed_path:
    raise ValueError(
        "PREPROCESSED_DATA_DIR is not set; cannot save the preprocessed data."
    )
# Make sure the destination folder exists so the write works on a fresh checkout.
preprocessed_parent_dir = os.path.dirname(df_preprocessed_path)
if preprocessed_parent_dir:
    os.makedirs(preprocessed_parent_dir, exist_ok=True)
df_preprocessed.to_csv(df_preprocessed_path, index=False)