In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def _read_jsonl(path):
    """Read a UTF-8 JSON Lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True, encoding='utf-8')


# Input files, given as paths relative to the current working directory.
sessions_path = 'content/sessions.jsonl'
tracks_path = 'content/tracks.jsonl'
users_path = 'content/users.jsonl'
artists_path = 'content/artists.jsonl'

# One DataFrame per input file.
sessions = _read_jsonl(sessions_path)
tracks = _read_jsonl(tracks_path)
users = _read_jsonl(users_path)
artists = _read_jsonl(artists_path)


In [3]:
# Print the column index of each loaded table, one table per print call.
for table_name, table in (
    ('sessions', sessions),
    ('tracks', tracks),
    ('users', users),
    ('artists', artists),
):
    print(f'{table_name}: {table.columns}')


sessions: Index(['timestamp', 'user_id', 'track_id', 'event_type', 'session_id'], dtype='object')
tracks: Index(['id', 'artist_id', 'name', 'popularity', 'duration_ms', 'explicit',
       'release_date', 'danceability', 'energy', 'key', 'mode', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature'],
      dtype='object')
users: Index(['user_id', 'name', 'city', 'street', 'favourite_genres',
       'premium_user'],
      dtype='object')
artists: Index(['id', 'name', 'genres'], dtype='object')


In [4]:
# Per-event helper columns on `sessions`; the per-user aggregation cell sums
# and averages these.

# Track length for every event, looked up by track id. Events whose track_id
# has no entry in `tracks` get NaN here.
track_id_duration = dict(zip(tracks['id'], tracks['duration_ms']))
sessions['track_len'] = sessions['track_id'].map(track_id_duration)

# Signed listening time: +track length for "Play", -track length for "Skip",
# 0 for every other event type. The column is built in one vectorised step so
# its dtype follows `track_len`. Creating an int column of zeros and then
# assigning (possibly NaN) floats into it relies on silent upcasting, which
# newer pandas versions deprecate.
# @TODO use the time between the Play and the Skip event so that only the time
# the song was actually played is counted.
is_play = sessions['event_type'] == "Play"
is_skip = sessions['event_type'] == "Skip"
sessions['duration_ms'] = np.select(
    [is_play, is_skip],
    [sessions['track_len'], -sessions['track_len']],
    default=0,
)

# Calendar keys used for per-day / per-month grouping (stored as strings).
sessions['timestamp'] = pd.to_datetime(sessions['timestamp'])
sessions['date'] = sessions['timestamp'].dt.strftime('%Y-%m-%d')
sessions['month'] = sessions['timestamp'].dt.strftime('%Y-%m')

# Event indicators. A "Skip" counts as -1 so that summing the column gives
# plays minus skips; all other event types count as 0.
sessions['song_played_in_full'] = sessions['event_type'].map({"Play": 1, "Skip": -1}).fillna(0)
sessions['song_liked'] = sessions['event_type'].map({"Like": 1}).fillna(0)
sessions['Advertisment_played'] = sessions['event_type'].map({"Advertisement": 1}).fillna(0)


### Kalkulacje atrybutów

In [None]:
def _total_per_user(events, column, feature_name):
    """Sum `column` over all rows of each user; the result is indexed by user_id."""
    return events.groupby("user_id")[column].sum().rename(feature_name)


def _daily_mean_per_user(events, column, feature_name):
    """Sum `column` per (user_id, date), then average those daily sums per user.

    Only dates on which the user has at least one row in `events` enter the
    mean, so this is an average over active days, not over calendar days.
    """
    daily_totals = events.groupby(["user_id", "date"])[column].sum()
    return daily_totals.groupby(level="user_id").mean().rename(feature_name)


# Number of distinct tracks per user whose net play count (plays minus skips)
# is positive. The flag is summed, so tracks that were only skipped or liked
# are not counted. Rows without a track_id are dropped by the groupby.
net_plays_per_track = sessions.groupby(["user_id", "track_id"])["song_played_in_full"].sum()
unique_songs_played = (
    net_plays_per_track.gt(0)
    .groupby(level="user_id")
    .sum()
    .rename("Total_nr_of_unique_songs_played")
)

# All features are collected in one frame indexed by user_id and merged into
# `users` once at the end, which keeps this cell safe to re-run.
user_features = pd.concat(
    [
        # total (signed) time the user listened
        _total_per_user(sessions, "duration_ms", "Total_time_listening_ms"),
        # total number of songs played (plays minus skips)
        _total_per_user(sessions, "song_played_in_full", "Total_nr_of_songs_played"),
        # number of songs played on an average active day
        _daily_mean_per_user(sessions, "song_played_in_full", "Nr_of_songs_played_per_day"),
        # listening time on an average active day
        _daily_mean_per_user(sessions, "duration_ms", "Time_listening_per_day_ms"),
        unique_songs_played,
    ],
    axis=1,
)

# number of unique songs played per day
# @TODO

# Songs played / unique songs played (relistening ratio). A zero denominator
# gives NaN instead of +/-inf.
user_features["Relistening_ratio"] = (
    user_features["Total_nr_of_songs_played"]
    / user_features["Total_nr_of_unique_songs_played"].replace(0, np.nan)
)

user_features = pd.concat(
    [
        user_features,
        # number of songs liked and advertisements heard
        _total_per_user(sessions, "song_liked", "Total_nr_of_songs_liked"),
        _total_per_user(sessions, "Advertisment_played", "Total_nr_of_ads_heard"),
        # @TODO probably drop this one: it depends on the premium status
        _daily_mean_per_user(sessions, "Advertisment_played", "Nr_of_ads_per_day"),
        _daily_mean_per_user(sessions, "song_liked", "Nr_of_songs_liked_per_day"),
    ],
    axis=1,
)

# Like ratio (likes / songs played, both per active day); NaN on a zero denominator.
user_features["Like_to_song_ratio_per_day"] = (
    user_features["Nr_of_songs_liked_per_day"]
    / user_features["Nr_of_songs_played_per_day"].replace(0, np.nan)
)

# User age on the service: time since the user's first recorded event.
# @TODO probably drop or do not use. NOTE: this is measured against the
# wall-clock time of the run, so the value changes between runs.
first_event = pd.to_datetime(sessions.groupby("user_id")["timestamp"].min()).dt.tz_localize(None)
now = pd.Timestamp.now(tz=None)
user_features["timestamp"] = first_event
user_features["User_age_in_days"] = (now - user_features["timestamp"]).dt.days
user_features["User_age_in_years"] = user_features["User_age_in_days"] // 365

# Drop any feature columns left over from a previous run of this cell, then
# left-merge so users without sessions are kept (their features become NaN).
user_features = user_features.rename_axis("user_id").reset_index()
users = (
    users
    .drop(columns=user_features.columns.drop("user_id"), errors="ignore")
    .merge(user_features, on="user_id", how="left")
)

# Binarisation of the target: premium -> 1, non-premium -> -1.
users["is_premium"] = users["premium_user"].map({True: 1, False: -1})


users.head(3)


KeyError: 'Nr_of_unique_songs_played_per_day'

In [None]:
# Print the column index of `users`.
# NOTE(review): the saved output under this cell lists column names that the
# feature cell above does not create — re-run the notebook to refresh it.
print(users.columns)

Index(['user_id', 'name', 'city', 'street', 'favourite_genres', 'premium_user',
       'total_time_songs_played_ms', 'nr_song_played_in_full',
       'avg_nr_songs_played_per_day', 'avg_nr_songs_played_per_session',
       'avg_time_playing_by_session', 'avg_time_playing_in_a_day',
       'number_of_unique_songs_listened_through', 'nr_of_songs_liked',
       'nr_of_ads_heard', 'nr_of_ads_heard_per_day',
       'nr_of_songs_liked_per_day', 'is_premium'],
      dtype='object')


Unnamed: 0,user_id,timestamp,years_age
0,101,2021-12-22 15:02:00,2
1,102,2022-01-02 01:44:00,2
2,103,2021-10-26 09:01:00,3
3,104,2021-10-11 13:43:00,3
4,105,2022-03-19 21:03:00,2
...,...,...,...
2995,3096,2021-10-18 04:33:00,3
2996,3097,2022-03-17 17:07:00,2
2997,3098,2022-08-30 12:31:00,2
2998,3099,2022-06-09 02:18:00,2


# Analiza wygenerowanych atrybutów

In [None]:
# Model inputs: per-user activity features, using the column names that the
# feature-engineering cell actually creates. The previously listed per-session
# averages are not computed anywhere in this notebook, so they are not selected.
FEATURE_COLUMNS = [
    'Total_time_listening_ms', 'Total_nr_of_songs_played',
    'Nr_of_songs_played_per_day', 'Time_listening_per_day_ms',
    'Total_nr_of_unique_songs_played', 'Total_nr_of_songs_liked',
    'Total_nr_of_ads_heard', 'Nr_of_ads_per_day',
    'Nr_of_songs_liked_per_day',
]

# Fail early with a readable message instead of a bare KeyError.
missing_features = [name for name in FEATURE_COLUMNS if name not in users.columns]
assert not missing_features, f"users is missing feature columns: {missing_features}"

# Users without any session get NaN from the left merge; treat that as zero
# activity so the scaler and the regression receive finite values.
X = users[FEATURE_COLUMNS].fillna(0)

# Target: premium status encoded as +1 / -1, kept as a single-column frame.
Y = users[['is_premium']]



In [None]:

# Histogram of every feature in X, laid out on a grid with two columns.
feature_columns = list(X.keys())
n_attributes = len(feature_columns)

n_cols = 2
n_rows = -(-n_attributes // n_cols)  # ceiling division: enough rows for all features

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten()

# zip stops at the shorter sequence, so each feature gets exactly one panel.
for ax, column in zip(axes, feature_columns):
    X[column].plot(kind="hist", bins=20, alpha=0.7, ax=ax)
    ax.set_title(f"{column}")
    ax.set_ylabel('Frequency')

# Hide the panels left over when the feature count does not fill the grid.
for unused_ax in axes[n_attributes:]:
    unused_ax.axis('off')

plt.show()


In [None]:
# Standardise the values in X column by column.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)

# transform() returns a plain array, so the column labels are re-attached here;
# the resulting frame gets a fresh default index rather than X's index.
X_standardized = pd.DataFrame(scaler.transform(X), columns=X.columns)


In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of Y on the standardised features; predictions are made on
# the same rows that were used for fitting.
model = LinearRegression().fit(X_standardized, Y)
Yp = model.predict(X_standardized)

# Report the fitted parameters.
print("Bias:", model.intercept_)
print("Coefficient:", model.coef_)

# Labels and values for the coefficient plot: one name per feature column and
# the first row of the coefficient matrix.
feature_names = [str(column) for column in X_standardized.keys()]
weights = model.coef_[0]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

# Flatten the targets and threshold the regression output once. Predictions
# >= 0 become +1 and the rest -1, so an exactly-zero prediction cannot create
# a third class (np.sign would return 0 for it).
y_true = np.asarray(Y).ravel()
y_pred = np.where(np.asarray(Yp).ravel() >= 0, 1, -1)

# MSE and R2 are computed on the thresholded labels, not on the raw output.
print("MSE:", mean_squared_error(y_true, y_pred))
print("R2:", r2_score(y_true, y_pred))
# Explicit label order: rows/columns are [-1, 1] even if one class is absent.
print("Confusion Matrix:", confusion_matrix(y_true, y_pred, labels=[-1, 1]))

# Class balance of the true and the predicted labels; the negative class is -1.
print(f"Y 1s {int((y_true == 1).sum())}  -1s {int((y_true == -1).sum())}")
print(f"Yp 1s {int((y_pred == 1).sum())}  -1s {int((y_pred == -1).sum())}")



In [None]:

# Bar chart of the fitted regression coefficients, one bar per feature.
coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
coef_ax.bar(feature_names, model.coef_[0], color='skyblue')
coef_ax.set_xlabel('Features')
coef_ax.set_ylabel('Coefficient Value')
coef_ax.set_title('Linear Regression Coefficients')

# Turn the feature names vertical.
for tick_label in coef_ax.get_xticklabels():
    tick_label.set_rotation(90)

plt.show()
