In [402]:
import numpy as np
import pandas as pd
from pycaret.anomaly import setup, create_model, predict_model

data=np.load("week_9_data/training_data_with_labels.npz")
data_new = np.load("week_10_data/first_batch_test_with_labels.npz")

X_existing = data["X"]
y_existing = data["y"]
X_existing[:, 0] = X_existing[:, 0] - 100
y_existing[:, 0] = y_existing[:, 0] - 100

X_new = data_new["X"]
y_new = data_new["y"]

In [403]:
X = np.concatenate((X_existing, X_new), axis=0)
y = np.concatenate((y_existing, y_new), axis=0)

XX=pd.DataFrame(X)
yy=pd.DataFrame(y)
XX.rename(columns={0:"user",1:"item",2:"rating"},inplace=True)
print("# of items:", XX['item'].unique().shape[0])

# of items: 1197


In [404]:
yy.rename(columns={0:"user",1:"label"},inplace=True)
yy.head()

Unnamed: 0,user,label
0,0,1
1,1,0
2,2,0
3,3,0
4,4,0


In [405]:
merged_data = pd.merge(XX, yy, on='user', how='left')
merged_data.head(110)

Unnamed: 0,user,item,rating,label
0,99,0,3,0
1,99,4,5,0
2,99,7,4,0
3,99,12,4,0
4,99,13,5,0
...,...,...,...,...
105,31,164,3,0
106,31,171,2,0
107,31,174,2,0
108,31,175,3,0


In [406]:
len(merged_data["user"].unique())

2200

In [407]:
groups_X = merged_data.groupby('user')
groups_X = [group for _, group in groups_X]

In [408]:
from scipy.stats import entropy

# Feature Engineering
features_list = []

for group in groups_X:
    user = group['user'].iloc[0]
    label = group['label'].iloc[0] 
    min_rating = group['rating'].min()
    total_interactions = group.shape[0]
    rating_range = (group['rating'].max() - group['rating'].min())
    unique_items = group['item'].nunique()
    ratings_count = group['rating'].value_counts(normalize=True)
    ratings_entropy = entropy(ratings_count)
    item_diversity = group['item'].nunique() / merged_data['item'].nunique()
    proportion_of_5_stars = (group['rating'] == 5).sum() / group.shape[0]
    proportion_of_1_star = (group['rating'] == 1).sum() / group.shape[0]
    mode_count = group['rating'].value_counts(dropna=True).max()
    mode_ratio = mode_count / total_interactions
    median_rating = group['rating'].median()
    average_rating = group['rating'].mean()
    

    score = 0
    for item in group['item']:
        score = score + item

    rating_variance = group['rating'].var()
    extreme_ratings_ratio = ((group['rating'] == 1) | (group['rating'] == 5)).sum() / total_interactions

    features_list.append({
        'user': user,
        'label': label,
        # 'min_rating': min_rating,
        'total_interactions': total_interactions,
        'rating_range': rating_range,
        # 'unique_items': unique_items,
        # 'score': score,
        # 'rating_variance': rating_variance,
        'extreme_ratings_ratio': extreme_ratings_ratio,
        # 'ratings_entropy': ratings_entropy,
        # 'item_diversity': item_diversity,
        'proportion_of_5_stars': proportion_of_5_stars,
        'proportion_of_1_star': proportion_of_1_star,
        'mode_ratio': mode_ratio,
        # 'median_rating': median_rating,
        # 'average_rating': average_rating
    })

# Create a DataFrame with the features
features_df = pd.DataFrame(features_list)
# min_rating_0 = features_df['min_rating'] == 0
# features_df['min_rating_0'] = min_rating_0.astype(int)
features_df.head()

Unnamed: 0,user,label,total_interactions,rating_range,extreme_ratings_ratio,proportion_of_5_stars,proportion_of_1_star,mode_ratio
0,0,1,88,4,0.147727,0.079545,0.068182,0.454545
1,1,0,363,5,0.336088,0.049587,0.286501,0.286501
2,2,0,52,3,0.480769,0.480769,0.0,0.480769
3,3,0,173,3,0.00578,0.00578,0.0,0.543353
4,4,0,92,4,0.423913,0.402174,0.021739,0.445652


In [409]:
# # Define thresholds
# rare_threshold = 400  
# disliked_threshold = 4 

# # Calculate the count of ratings per item
# item_counts = merged_data['item'].value_counts()

# # Find rare items based on the threshold
# rare_items = item_counts[item_counts < rare_threshold].index

# # Calculate average rating per item
# item_avg_rating = merged_data.groupby('item')['rating'].mean()

# # Find disliked items based on the threshold
# disliked_items = item_avg_rating[item_avg_rating < disliked_threshold].index

# # Mark interactions with rare and disliked items
# merged_data['rare_interaction'] = merged_data['item'].isin(rare_items).astype(int)
# merged_data['disliked_interaction'] = merged_data['item'].isin(disliked_items).astype(int)

# # Calculate the mean proportion of rare and disliked interactions per user
# user_rare_disliked_interaction = merged_data.groupby('user').agg({
#     'rare_interaction': 'mean',  # Proportion of interactions with rarely rated items
#     'disliked_interaction': 'mean'  # Proportion of interactions with disliked items

# }).reset_index()

# features_df = features_df.merge(user_rare_disliked_interaction, on='user', how='left')

# item_variance = merged_data.groupby('item')['rating'].var().reset_index(name='item_variance')
# merged_data_var = merged_data.merge(item_variance, on='item', how='left')

# user_avg_item_variance = merged_data_var.groupby('user')['item_variance'].mean().reset_index(name='avg_item_variance')

# # Merge this new feature into features_df
# features_df = features_df.merge(user_avg_item_variance, on='user', how='left')


In [410]:
features_df.drop(columns=['user'], inplace=True)
features_df['label'].value_counts()

label
0    2000
1     200
Name: count, dtype: int64

In [411]:
from sklearn.model_selection import train_test_split

X = features_df.drop(columns=['label'])
y = features_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [412]:
# Set up PyCaret environment
model = setup(data=X_train, normalize=True, session_id=42)
print(model)
# Create and tune the model
iforest = create_model('lof', fraction=0.1)

result = assign_model(iforest)
result['Anomaly'].value_counts()

Unnamed: 0,Description,Value
0,Session id,42
1,Original data shape,"(1760, 6)"
2,Transformed data shape,"(1760, 6)"
3,Numeric features,6
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Normalize,True
9,Normalize method,zscore


<pycaret.anomaly.oop.AnomalyExperiment object at 0x2832b2140>


Anomaly
0    1584
1     176
Name: count, dtype: int64

In [413]:
from sklearn.metrics import roc_auc_score

# predictions = predict_model(iforest, data = X_test)
# predictions.head(20)

In [414]:
# Calculate ROC AUC
roc_auc = roc_auc_score(y_train, result['Anomaly'])
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.4931249999999999
