In [1]:
import polars as pl

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
ratings = pl.read_csv('mlflow/ml-100k/u.data', separator='\t', has_header=False, new_columns=['user_id', 'item_id', 'rating', 'timestamp'])
users = pl.read_csv('mlflow/ml-100k/u.user', separator='|', has_header=False, new_columns=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
movies = pl.read_csv('mlflow/ml-100k/u.item', separator='|', has_header=False, encoding='latin-1', new_columns=[
    'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
])

data = ratings.join(users, on='user_id').join(movies, on='item_id')

data = data.with_columns((data['rating'] == 5).cast(pl.Int32).alias('is_rating_5'))

data = data.drop(['video_release_date'])
data = data.drop_nulls()

feature_columns = ['age', 'gender', 'occupation'] + movies.columns[5:]
X = data.select(feature_columns)
y = data['is_rating_5']

# Encode categorical variables
le_gender = LabelEncoder()
X = X.with_columns(pl.Series('gender', le_gender.fit_transform(X['gender'].to_list())))

le_occupation = LabelEncoder()
X = X.with_columns(pl.Series('occupation', le_occupation.fit_transform(X['occupation'].to_list())))

# Convert Polars DataFrames to NumPy arrays for sklearn
X_np = X.to_numpy()
y_np = y.to_numpy()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42, stratify=y_np)

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

In [13]:
import json

data_dict = X.sample(n=1).to_pandas().to_json()
json.dumps(data_dict)

'"{\\"age\\":{\\"0\\":19},\\"gender\\":{\\"0\\":0},\\"occupation\\":{\\"0\\":18},\\"unknown\\":{\\"0\\":0},\\"Action\\":{\\"0\\":0},\\"Adventure\\":{\\"0\\":0},\\"Animation\\":{\\"0\\":0},\\"Children\'s\\":{\\"0\\":0},\\"Comedy\\":{\\"0\\":1},\\"Crime\\":{\\"0\\":0},\\"Documentary\\":{\\"0\\":0},\\"Drama\\":{\\"0\\":0},\\"Fantasy\\":{\\"0\\":0},\\"Film-Noir\\":{\\"0\\":0},\\"Horror\\":{\\"0\\":0},\\"Musical\\":{\\"0\\":0},\\"Mystery\\":{\\"0\\":0},\\"Romance\\":{\\"0\\":1},\\"Sci-Fi\\":{\\"0\\":0},\\"Thriller\\":{\\"0\\":0},\\"War\\":{\\"0\\":0},\\"Western\\":{\\"0\\":0}}"'