In [1]:
import polars as pl

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the three MovieLens 100K tables. The raw files carry no header row,
# so column names are supplied explicitly for each of them.

# u.item layout: five descriptive columns followed by 19 genre columns.
movie_columns = ["item_id", "movie_title", "release_date", "video_release_date", "IMDb_URL"] + [f"genre_{i}" for i in range(19)]

# Ratings: tab-separated rows of (user_id, item_id, rating, timestamp).
ratings = pl.read_csv(
    'ml-100k/u.data',
    separator='\t',
    has_header=False,
    new_columns=['user_id', 'item_id', 'rating', 'timestamp']
)

# Movies: pipe-separated. The MovieLens 100K text files are ISO-8859-1
# (Latin-1); decoding them as cp1251 does not fail but silently turns accented
# Latin letters in titles into Cyrillic ones, so Latin-1 is used here.
# ignore_errors=True keeps reading when a value cannot be parsed as the
# inferred column dtype instead of aborting the whole load.
movies = pl.read_csv(
    'ml-100k/u.item',
    separator="|",
    has_header=False,
    new_columns=movie_columns,
    encoding="latin-1",
    ignore_errors=True
)


# Users: pipe-separated demographics, read with the same encoding and error
# handling as the movie table for consistency.
users = pl.read_csv(
    'ml-100k/u.user',
    separator='|',
    has_header=False,
    new_columns=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    encoding="latin-1",
    ignore_errors=True
)


In [4]:
# Attach movie metadata to every rating (join key: 'item_id').
ratings_with_titles = ratings.join(movies, on="item_id")

# Binary target: 1 for a 5-star rating, 0 for any other rating.
is_top_rating = pl.col('rating') == 5

# Add user attributes (join key: 'user_id'), derive the target, then remove
# 'video_release_date' before discarding every row that still has a null in
# any remaining column.
dataset = (
    ratings_with_titles
    .join(users, on='user_id')
    .with_columns(
        pl.when(is_top_rating).then(1).otherwise(0).alias("target")
    )
    .drop(['video_release_date'])
    .drop_nulls()
)

# Replace 'gender' and 'occupation' with LabelEncoder integer codes.
# One encoder per column is kept in its own variable so the fitted
# class-to-code mapping stays available after this cell.
le_gender = LabelEncoder()
le_occupation = LabelEncoder()
encoders = {"gender": le_gender, "occupation": le_occupation}

dataset = dataset.with_columns([
    pl.Series(column, encoder.fit_transform(dataset[column].to_list()))
    for column, encoder in encoders.items()
])

# Preview the first rows of the prepared table
print(dataset.head())

shape: (5, 31)
┌─────────┬─────────┬────────┬───────────┬───┬────────┬────────────┬──────────┬────────┐
│ user_id ┆ item_id ┆ rating ┆ timestamp ┆ … ┆ gender ┆ occupation ┆ zip_code ┆ target │
│ ---     ┆ ---     ┆ ---    ┆ ---       ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---    │
│ i64     ┆ i64     ┆ i64    ┆ i64       ┆   ┆ i64    ┆ i64        ┆ str      ┆ i32    │
╞═════════╪═════════╪════════╪═══════════╪═══╪════════╪════════════╪══════════╪════════╡
│ 196     ┆ 242     ┆ 3      ┆ 881250949 ┆ … ┆ 1      ┆ 20         ┆ 55105    ┆ 0      │
│ 186     ┆ 302     ┆ 3      ┆ 891717742 ┆ … ┆ 0      ┆ 6          ┆ 00000    ┆ 0      │
│ 22      ┆ 377     ┆ 1      ┆ 878887116 ┆ … ┆ 1      ┆ 20         ┆ 40206    ┆ 0      │
│ 244     ┆ 51      ┆ 2      ┆ 880606923 ┆ … ┆ 1      ┆ 19         ┆ 80525    ┆ 0      │
│ 166     ┆ 346     ┆ 1      ┆ 886397596 ┆ … ┆ 1      ┆ 3          ┆ 55113    ┆ 0      │
└─────────┴─────────┴────────┴───────────┴───┴────────┴────────────┴──────────┴────────┘


In [5]:
# Columns kept for modelling: the rating timestamp, the user attributes,
# the binary target and the 19 genre columns.
genre_columns = [f"genre_{i}" for i in range(19)]
train_columns = ['timestamp', 'age', 'gender', 'occupation', 'target', *genre_columns]
dataset = dataset.select(train_columns)

In [6]:
# Hold out the most recent 5% of ratings (by timestamp) as the test set;
# everything before that moment is used to train the model.
ts_threshold = dataset['timestamp'].quantile(0.95)
train = dataset.filter(pl.col('timestamp') < ts_threshold)
test = dataset.filter(pl.col('timestamp') >= ts_threshold)

# Every column except 'target' is a feature.
# NOTE(review): this keeps the raw 'timestamp' as a feature although every
# test row lies beyond the training time range - confirm this is intended.
feature_columns = pl.exclude('target')

X_train = train.select(feature_columns)
y_train = train['target']
X_test = test.select(feature_columns)
y_test = test['target']

# Oversample the training set only; the test set keeps its original class
# distribution. imbalanced-learn is fed pandas objects.
# NOTE(review): plain SMOTE interpolates between samples, so the encoded
# 'gender' / 'occupation' codes can become fractional - consider whether a
# categorical-aware variant is needed.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(
    X_train.to_pandas(),
    y_train.to_pandas(),
)

### Результаты исследования данных
1. Дисбаланс классов (`class_imbalance`)
2. Пропуски (null) в данных
3. _TODO: дописать остальные наблюдения_

In [7]:
# Fit a random forest on the resampled (SMOTE-balanced) training data.
# random_state is fixed so repeated runs build the same forest.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_resampled, y_resampled)

# Predict on the held-out set. The model was fitted on a pandas DataFrame, so
# the polars test frame is converted the same way: scikit-learn then receives
# the same column names in the same order as during fit and can validate them,
# instead of warning about (or skipping the check of) feature names.
y_pred = clf.predict(X_test.to_pandas())

In [8]:
# Score the hold-out predictions: overall accuracy plus the per-class
# precision / recall / F1 table.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# One print call, one item per line.
print(
    f"Accuracy: {accuracy:.2f}",
    "Classification Report:",
    report,
    sep="\n",
)

Accuracy: 0.61
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.74      0.74      3657
           1       0.27      0.26      0.27      1343

    accuracy                           0.61      5000
   macro avg       0.50      0.50      0.50      5000
weighted avg       0.61      0.61      0.61      5000

