In [None]:
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install -U scikit-learn
!pip install ipympl
!pip install seaborn
!pip install pyarrow
!pip install scipy
!pip install duckdb
!pip install polars

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import json
from scipy.stats import bootstrap
import scipy.stats as stats
import polars as pl
import duckdb
#pd.options.display.float_format = '{:,.0f}'.format

In [None]:
# One-time conversion of the raw CSV export to Parquet (uncomment to regenerate the Parquet file):
# dfreco = pd.read_csv("recommendations.csv")
# dfreco.to_parquet("recommendations.parquet")

In [None]:
# Load the recommendations table from Parquet into a polars DataFrame.
dfreco = pl.read_parquet("recommendations.parquet")
# The loads below are currently disabled; dfuser is read from users.csv in a later cell.
#dfgame = pd.read_csv("games.csv")
#dfuser = pd.read_csv("users.csv")
#dfmeta = pd.read_json('games_metadata.json', lines=False)

In [None]:

# Display the column dtypes of the recommendations frame.
dfreco.dtypes

In [None]:
def create_box_and_hist(data, column):
    """Show a histogram of ``data[column]`` stacked above a horizontal box plot.

    Args:
        data: Table-like object that supports ``data[column]`` lookup.
        column: Name of the column to plot; also used in the x-axis label.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    values = data[column]
    layout = {"height_ratios": (.85, .15)}
    figure, axes = plt.subplots(2, figsize=(4, 4), gridspec_kw=layout)
    hist_axis, box_axis = axes

    # Upper panel: 20-bin histogram.
    hist_axis.hist(values, bins=20, color='skyblue', edgecolor='black')
    hist_axis.set_title('Histogram')
    hist_axis.set_xlabel(f'# {column}')

    # Lower panel: horizontal box plot with ticks hidden.
    box_axis.boxplot(
        values,
        vert=False,
        widths=0.7,
        patch_artist=True,
        boxprops={'facecolor': 'orange'},
    )
    box_axis.set_xticks([])
    box_axis.set_yticks([])

    plt.tight_layout()
    plt.show()

def boot(data, column, stat, stat_label):
    """Bootstrap a statistic of ``data[column]`` and plot its bootstrap distribution.

    Args:
        data: Object whose ``data[column]`` exposes a ``.values`` array.
        column: Name of the column to resample.
        stat: Statistic callable passed to ``scipy.stats.bootstrap`` (e.g. ``np.mean``).
        stat_label: Text used as the x-axis label of the histogram.

    Returns:
        The result object returned by ``scipy.stats.bootstrap`` (90% confidence level,
        fixed seed 42, one resample per batch).
    """
    seed = 42
    sample = data[column].values
    result = bootstrap(
        (sample,),
        stat,
        confidence_level=0.9,
        random_state=seed,
        batch=1,
    )

    # Histogram of the resampled statistic.
    figure, axis = plt.subplots()
    axis.hist(result.bootstrap_distribution, bins=25)
    axis.set_title('Bootstrap Distribution')
    axis.set_xlabel(stat_label)
    axis.set_ylabel('frequency')
    plt.show()

    return result

In [None]:
# load our users
# Load the per-user table.
dfuser = pd.read_csv("users.csv")
# set_index returns a new frame; keep the result so user_id really becomes the index
# (the original call discarded it and therefore had no effect).
dfuser = dfuser.set_index('user_id')
dfuser.info()

In [None]:
# Per-user agreement statistics from a self-join of dfreco on app_id: each review r1 is
# paired with every review r2 of the same app whose date is on or after r1's date, and the
# pairs are aggregated per r1.user_id.
# NOTE(review): unlike the DuckDB query in the following cell there is no
# `r1.user_id <> r2.user_id` filter here, so every review also pairs with itself — confirm intended.
# NOTE(review): games_recommended / games_not_recommended are counted over the joined
# (r1, r2) pairs, i.e. they count pairs rather than distinct apps — confirm intended.
query = """SELECT
	r1.user_id,
	COUNT(r2.user_id) as reviewed_on_after,
	COUNT(case when r1.is_recommended = 1 then r1.app_id else NULL end) as games_recommended,
	COUNT(case when r1.is_recommended = 0 then r1.app_id else NULL end) as games_not_recommended,
	COUNT(case when r1.is_recommended = r2.is_recommended then r2.user_id else NULL end) as agreed_with,
	COUNT(case when r1.is_recommended <> r2.is_recommended then r2.user_id else NULL end) as disagreed_with
FROM 
	dfreco r1
	JOIN dfreco r2 ON r2.app_id = r1.app_id
WHERE 
  1=1
  AND r2.date >= r1.date
GROUP BY 
	r1.user_id"""



# Run the query with polars' SQL context; register_globals=True makes the frames defined
# in the notebook (here dfreco) available as tables, eager_execution=True returns a
# materialized result instead of a lazy one.
with pl.SQLContext(register_globals=True, eager_execution=True) as ctx:
    df_small = ctx.execute(query)
    # Convert to pandas only to print the column / memory summary.
    df_small.to_pandas().info()

In [None]:
# Per-user agreement statistics, computed with DuckDB directly on the dfreco frame.
#
# Each review r1 is joined to every review r2 of the same app that was written by a
# different user on or after r1's date. Per r1.user_id the query returns:
#   reviewed_on_after      number of such later reviews by other users
#   games_recommended      distinct apps the user recommended
#   games_not_recommended  distinct apps the user did not recommend
#   agreed_with            later reviews that carry the same verdict as the user's review
#   disagreed_with         later reviews that carry the opposite verdict
#
# The two games_* columns use COUNT(DISTINCT ...): after the join every r1 row is repeated
# once per matching r2 row, so a plain COUNT would count (r1, r2) pairs instead of games.
# Because this is an inner join, users whose reviews have no matching r2 row do not appear.
query = """
SELECT
    r1.user_id,
    COUNT(r2.user_id) AS reviewed_on_after,
    COUNT(DISTINCT CASE WHEN r1.is_recommended = 1 THEN r1.app_id END) AS games_recommended,
    COUNT(DISTINCT CASE WHEN r1.is_recommended = 0 THEN r1.app_id END) AS games_not_recommended,
    COUNT(CASE WHEN r1.is_recommended = r2.is_recommended THEN r2.user_id END) AS agreed_with,
    COUNT(CASE WHEN r1.is_recommended <> r2.is_recommended THEN r2.user_id END) AS disagreed_with
FROM dfreco r1
JOIN dfreco r2
    ON r2.app_id = r1.app_id
WHERE r1.user_id <> r2.user_id
    AND r2.date >= r1.date
GROUP BY r1.user_id
"""

mined_recommendations = duckdb.sql(query).to_df()



In [None]:
# Display the per-user statistics produced by the DuckDB query.
mined_recommendations

In [None]:
# Q-Q plot of log(reviews + 1) against a normal distribution.
log_reviews_plus_one = np.log(dfuser.reviews.values + 1)
stats.probplot(log_reviews_plus_one, dist='norm', plot=plt)
plt.title('Q-Q Plot of Log-Transformed Data')
plt.show()

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk normality test on log(reviews + 1).
# NOTE(review): SciPy's documentation warns that the p-value may be inaccurate
# for N > 5000 — check the sample size before relying on it.
shapiro_input = np.log(dfuser.reviews.values + 1)
stat, p_value = shapiro(shapiro_input)
print('Shapiro-Wilk Test Statistic:', stat)
print('p-value:', p_value)

In [None]:
from scipy.stats import kstest

# Kolmogorov-Smirnov test of (reviews + 1) against a log-normal distribution whose
# shape and scale are fitted to the same sample (location fixed at 0).
reviews_plus_one = dfuser.reviews.values + 1
shape, loc, scale = stats.lognorm.fit(reviews_plus_one, floc=0)


def cdf(x):
    """CDF of the log-normal distribution with the fitted shape, loc and scale."""
    return stats.lognorm.cdf(x, shape, loc, scale)


stat, p_value = kstest(reviews_plus_one, cdf)
print('Kolmogorov-Smirnov Test Statistic:', stat)
print('p-value:', p_value)

In [None]:
from scipy.stats import anderson

# Anderson-Darling normality test on log(reviews + 1).
anderson_input = np.log(dfuser.reviews.values + 1)
result = anderson(anderson_input, dist='norm')
print('Anderson-Darling Test Statistic:', result.statistic)
print('Critical Values:', result.critical_values)

In [None]:
# Chi-square goodness-of-fit test of a log-normal model for the positive review counts.
#
# reviews ~ LogNormal(mu, sigma) is equivalent to log(reviews) ~ Normal(mu, sigma), so the
# whole test is carried out on the log scale with a normal model. (Fitting `lognorm` to the
# already log-transformed values would apply the log twice, and the zeros produced by
# log(1) are not valid input for a log-normal fit with loc fixed at 0.)
data = np.log(dfuser.loc[dfuser.reviews > 0, 'reviews'].to_numpy())

# Step 1: observed frequencies. chisquare compares counts, so the histogram must NOT be
# normalised to a density.
observed_freq, bins = np.histogram(data, bins='auto')
bin_centers = (bins[:-1] + bins[1:]) / 2

# Step 2: fit the normal model and convert it to per-bin probabilities. Above the mean the
# survival function is used so that far-tail probabilities are not lost to rounding in
# cdf(right) - cdf(left) when both values are close to 1.
mu, sigma = stats.norm.fit(data)
left_edges, right_edges = bins[:-1], bins[1:]
bin_prob = np.where(
    left_edges >= mu,
    stats.norm.sf(left_edges, mu, sigma) - stats.norm.sf(right_edges, mu, sigma),
    stats.norm.cdf(right_edges, mu, sigma) - stats.norm.cdf(left_edges, mu, sigma),
)
expected_freq = bin_prob * observed_freq.sum()

# Step 3: rescale so the expected total equals the observed total (the model puts a little
# mass outside the histogram range, and chisquare requires matching totals).
expected_freq_scaled = expected_freq * observed_freq.sum() / expected_freq.sum()

# Step 4: pool neighbouring bins until every pooled bin has an expected count of at least
# MIN_EXPECTED; the chi-square approximation is unreliable for sparsely expected bins.
MIN_EXPECTED = 5
pooled_observed, pooled_expected = [], []
acc_observed = acc_expected = 0.0
for obs_count, exp_count in zip(observed_freq, expected_freq_scaled):
    acc_observed += obs_count
    acc_expected += exp_count
    if acc_expected >= MIN_EXPECTED:
        pooled_observed.append(acc_observed)
        pooled_expected.append(acc_expected)
        acc_observed = acc_expected = 0.0
if acc_observed or acc_expected:
    # Left-over tail that never reached MIN_EXPECTED: merge it into the last pooled bin.
    if pooled_expected:
        pooled_observed[-1] += acc_observed
        pooled_expected[-1] += acc_expected
    else:
        pooled_observed.append(acc_observed)
        pooled_expected.append(acc_expected)
pooled_observed = np.asarray(pooled_observed, dtype=float)
pooled_expected = np.asarray(pooled_expected, dtype=float)
# Remove accumulated rounding so the two totals agree exactly.
pooled_expected *= pooled_observed.sum() / pooled_expected.sum()

# Step 5: chi-square test. ddof=2 accounts for the two parameters (mu, sigma) that were
# estimated from the data.
chi2_stat, p_value = stats.chisquare(f_obs=pooled_observed, f_exp=pooled_expected, ddof=2)

print('Chi-Square Test Statistic:', chi2_stat)
print('p-value:', p_value)

# Optional: density histogram of log(reviews) with the fitted normal density on top
# (a true density, so it is directly comparable with the density=True histogram).
plt.hist(data, bins=bins, density=True, alpha=0.6, color='g', label='Observed Data')
plt.plot(bin_centers, stats.norm.pdf(bin_centers, mu, sigma), 'r--', linewidth=2,
         label='Fitted normal PDF of log(reviews)')
plt.xlabel('log(reviews)')
plt.ylabel('Density')
plt.legend()
plt.title('Histogram of log(reviews) and fitted normal PDF')
plt.show()


In [None]:
# Histogram + box plot of the review counts of all users.
create_box_and_hist(dfuser, 'reviews')

In [None]:
# Keep only the users whose review count is at or above the 99th percentile.
p99_reviews = np.percentile(dfuser.reviews.values, 99)
df_high_reviews = dfuser[dfuser.reviews >= p99_reviews]

In [None]:
# Histogram + box plot of the review counts of the users kept in df_high_reviews.
create_box_and_hist(df_high_reviews, 'reviews')

In [None]:
# Bootstrap plot of the review counts of the high-review users.
# The column is selected directly (dropping 'products' first only copied the frame), and
# the per-sample size is capped at the series length: bootstrap_plot requires
# size <= len(series) and fails otherwise.
high_review_counts = df_high_reviews['reviews']
pd.plotting.bootstrap_plot(high_review_counts, size=min(100000, len(high_review_counts)))
plt.show()

In [None]:
# Bootstrap the mean and the standard deviation of the high-review users' review counts;
# each call also plots the corresponding bootstrap distribution.
mean_stats = boot(df_high_reviews, 'reviews', np.mean, 'mean')
std_stats = boot(df_high_reviews, 'reviews', np.std, 'standard deviation')

In [None]:
# [lower confidence bound, mean of the bootstrap distribution, upper confidence bound] for the mean.
[mean_stats.confidence_interval.low, mean_stats.bootstrap_distribution.mean(), mean_stats.confidence_interval.high]

In [None]:
# Trim the high-review users to the band between their own 1st and 99th percentiles
# (both bounds inclusive) and plot what is left.
high_review_values = df_high_reviews.reviews.values
low, high = np.percentile(high_review_values, [1, 99])

inside_band = df_high_reviews.reviews.between(low, high)
df_users_clean = df_high_reviews[inside_band]
create_box_and_hist(df_users_clean, 'reviews')



In [None]:
# Bootstrap standard error of the mean computed above.
mean_stats.standard_error

In [None]:
# Bootstrap plot of the review counts of the high-review users.
# The column is selected directly (dropping 'products' first only copied the frame), and
# the per-sample size is capped at the series length: bootstrap_plot requires
# size <= len(series) and fails otherwise.
high_review_counts = df_high_reviews['reviews']
pd.plotting.bootstrap_plot(high_review_counts, size=min(100000, len(high_review_counts)))

In [None]:
# 99th percentile of the review counts over all users.
np.percentile(dfuser.reviews,99)

In [None]:
# Users with at least 24 reviews, and a box plot of their review counts.
# NOTE(review): 24 is hard-coded — presumably taken from the percentile shown above; confirm.
has_many_reviews = dfuser['reviews'] >= 24
big_users = dfuser.loc[has_many_reviews]
big_users.boxplot(column='reviews')

In [None]:
# Bootstrap plot of the review counts of big_users. The per-sample size is capped at the
# series length because bootstrap_plot requires size <= len(series) and fails otherwise.
big_user_counts = big_users.reviews
pd.plotting.bootstrap_plot(big_user_counts, size=min(100000, len(big_user_counts)))

In [None]:
# Mean of the `std_reviews` column.
# NOTE(review): no visible cell creates `std_reviews`; unless users.csv provides it, this
# raises AttributeError on a fresh kernel — confirm where the column comes from.
dfuser.std_reviews.mean()

In [None]:
# Histogram of the raw review counts of all users.
dfuser.reviews.hist()

In [None]:
# Users with more than 500 reviews.
limited = dfuser[dfuser.reviews > 500]

# Histogram of their `norm_reviews` values.
# NOTE(review): no visible cell creates `norm_reviews`; unless users.csv provides it, this
# raises AttributeError on a fresh kernel — confirm where the column comes from.
limited.norm_reviews.hist()


In [None]:
# Upper outlier fence for the review counts, Tukey style but with the 10th / 95th
# percentiles in place of the quartiles: q_high + 1.5 * (q_high - q_low).
# The original parenthesisation, (1.5 * q_high - q_low) + q_high, applied the 1.5 factor
# to q_high only instead of to the spread.
q_low = dfuser.reviews.quantile(0.10)
q_high = dfuser.reviews.quantile(0.95)
q_high + 1.5 * (q_high - q_low)

In [None]:
# Number of users whose review count exceeds 21.5.
int((dfuser.reviews > 21.5).sum())

In [None]:
# Prepare a scaler and the review counts (as an (n, 1) array) of users above 21.5 reviews.
scaler = StandardScaler()
hu = dfuser[dfuser.reviews > 21.5][['reviews']].values
# The scaling step itself is currently disabled.
#scaler.fit_transform(hu)


In [None]:
# Add a base-10 log of (reviews + 1) as a new column; the +1 keeps zero counts finite.
dfuser['log_reviews'] = np.log10(dfuser.reviews+1)

In [None]:

# Standardise the review counts and keep the users above an upper outlier fence.
scaler = StandardScaler()
hu = dfuser[['reviews']].values
# fit_transform returns an (n, 1) array; flatten it so it is assigned as a plain column.
dfuser['scaled_reviews'] = scaler.fit_transform(hu).ravel()

# Tukey-style fence with the 10th / 90th percentiles in place of the quartiles:
# q_high + 1.5 * (q_high - q_low). The original parenthesisation,
# (1.5 * q_high - q_low) + q_high, applied the 1.5 factor to q_high only.
q_low = dfuser.scaled_reviews.quantile(0.10)
q_high = dfuser.scaled_reviews.quantile(0.90)
upper_range = q_high + 1.5 * (q_high - q_low)

high_reviews = dfuser[dfuser.scaled_reviews > upper_range]

high_reviews

In [None]:
#Let's cluster users on number of reviews performed and see if we see a definite pattern of users
def _plot_silhouette_1d(X, labels, centers, sample_silhouette_values, silhouette_avg, n):
    """Draw the silhouette plot and a 1-D cluster view for one clustering; return the figure.

    Left panel: sorted per-sample silhouette values, one band per cluster, with the average
    marked by a dashed red line. Right panel: the single (scaled) feature on the x-axis
    against the assigned cluster label, with the cluster centers marked.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # (n + 1) * 10 leaves blank space between the bands of the individual clusters.
    ax1.set_ylim([0, len(X) + (n + 1) * 10])

    y_lower = 10
    for i in range(n):
        # Silhouette values of the samples in cluster i, sorted ascending.
        cluster_values = np.sort(sample_silhouette_values[labels == i])
        size_cluster_i = cluster_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            cluster_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Label the band with its cluster number at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # gap before the next band

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # There is only one feature, so it is plotted against the cluster label
    # (plotting the feature against itself only produced a diagonal line).
    colors = cm.nipy_spectral(labels.astype(float) / n)
    ax2.scatter(
        X[:, 0], labels, marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    center_rows = np.arange(len(centers))
    ax2.scatter(
        centers[:, 0],
        center_rows,
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for i, c in enumerate(centers):
        ax2.scatter(c[0], i, marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Standardized number of reviews")
    ax2.set_ylabel("Cluster label")

    fig.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n,
        fontsize=14,
        fontweight="bold",
    )
    return fig


scaler = StandardScaler()
# Silhouette computation is quadratic in the sample count, so work on a fixed random
# sample (capped at the frame length so small inputs do not fail).
review_sample = dfuser[['reviews']].sample(n=min(10000, len(dfuser)), random_state=1)
X = scaler.fit_transform(review_sample.values)
# 2..16 clusters (the original list contained 114 in place of 14).
n_clusters = list(range(2, 17))

for n in n_clusters:
    cmod = MiniBatchKMeans(n_clusters=n, random_state=42)
    labels = cmod.fit_predict(X)

    # The average silhouette score is the mean of the per-sample values, so a single
    # pairwise-distance pass provides both.
    sample_silhouette_values = silhouette_samples(X, labels)
    silhouette_avg = sample_silhouette_values.mean()
    print(
        "For n_clusters =",
        n,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    fig = _plot_silhouette_1d(
        X, labels, cmod.cluster_centers_, sample_silhouette_values, silhouette_avg, n
    )
    plt.show()
    plt.close(fig)  # release the figure instead of keeping all of them open



In [None]:
# Standardise the review counts of all users and split them into two clusters
# with mini-batch k-means (fixed seed).
scaler = StandardScaler()
X = scaler.fit_transform(dfuser[['reviews']].values)
cmod = MiniBatchKMeans(n_clusters=2, random_state=42)
# labels holds one cluster id (0 or 1) per row of dfuser.
labels = cmod.fit_predict(X)



In [None]:
# K-means cluster ids are arbitrary, so label 1 is not guaranteed to be the high-review
# cluster. Flag as anomalous (1) the cluster whose center has the larger scaled review count.
# NOTE(review): assumes "anomaly" is meant to be the high-review cluster — confirm.
high_cluster = int(np.argmax(cmod.cluster_centers_[:, 0]))
dfuser['anomaly'] = (labels == high_cluster).astype(int)

# Flagged users, excluding those who have fewer products than reviews.
dfuser[~(dfuser['products'] < dfuser['reviews']) & (dfuser['anomaly']==1)]

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs


def run_silhouette_demo(range_n_clusters=(2, 3, 4, 5, 6)):
    """Reference silhouette analysis of KMeans on synthetic 2-D blobs.

    The data and every intermediate result stay local to this function, so running the
    demo does not overwrite the notebook-level ``X`` / ``n_clusters`` used for the user
    clustering.

    Args:
        range_n_clusters: Iterable of cluster counts to evaluate.

    Returns:
        None. Prints the average silhouette score and shows one figure per cluster count.
    """
    # Generating the sample data from make_blobs.
    # This particular setting has one distinct cluster and 3 clusters placed close
    # together.
    blobs, _ = make_blobs(
        n_samples=500,
        n_features=2,
        centers=4,
        cluster_std=1,
        center_box=(-10.0, 10.0),
        shuffle=True,
        random_state=1,
    )  # For reproducibility

    for k in range_n_clusters:
        # Seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=k, random_state=10)
        cluster_labels = clusterer.fit_predict(blobs)

        # The average silhouette score is the mean of the per-sample values, so a single
        # pairwise-distance pass provides both.
        sample_silhouette_values = silhouette_samples(blobs, cluster_labels)
        silhouette_avg = sample_silhouette_values.mean()
        print(
            "For n_clusters =",
            k,
            "The average silhouette_score is :",
            silhouette_avg,
        )

        # One row, two columns: silhouette plot on the left, clusters on the right.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (k + 1) * 10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(blobs) + (k + 1) * 10])

        y_lower = 10
        for i in range(k):
            # Silhouette values of the samples in cluster i, sorted ascending.
            cluster_values = np.sort(sample_silhouette_values[cluster_labels == i])
            size_cluster_i = cluster_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / k)
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                cluster_values,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )
            # Label the silhouette plots with their cluster numbers at the middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # gap before the next band

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / k)
        ax2.scatter(
            blobs[:, 0], blobs[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
        )

        # Draw white circles at cluster centers and number them.
        centers = clusterer.cluster_centers_
        ax2.scatter(
            centers[:, 0],
            centers[:, 1],
            marker="o",
            c="white",
            alpha=1,
            s=200,
            edgecolor="k",
        )
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        fig.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % k,
            fontsize=14,
            fontweight="bold",
        )

        plt.show()
        plt.close(fig)  # release the figure instead of keeping all of them open


run_silhouette_demo()