In [None]:
from pathlib import Path

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score


%matplotlib inline

In [None]:
# Exploratory data analysis: load the raw titles, inspect them, derive the
# primary genre/country columns and handle missing values.
import ast  # parses the stringified list columns below

df = pd.read_csv('raw_titles.csv')

# Report the dimensions and a preview instead of dumping the whole frame
print(df.shape)
print(df.head())

# DataFrame.info() writes its report itself and returns None, so it is
# called directly rather than wrapped in print()
df.info()

# Summary statistics of numerical columns
print(df.describe())

# Duplicate check. It runs before the list conversion below because list
# values are unhashable; the result is printed so it is visible mid-cell.
print(df.duplicated().value_counts())

# 'genres' and 'production_countries' are parsed from their string form
# into Python lists
df['genres'] = df['genres'].apply(ast.literal_eval)
df['production_countries'] = df['production_countries'].apply(ast.literal_eval)

# Keep the first entry of each list (None when the list is empty)
df['primary_genre'] = df['genres'].apply(lambda x: x[0] if len(x) > 0 else None)
df['primary_country'] = df['production_countries'].apply(lambda x: x[0] if len(x) > 0 else None)

print(df.head())

# Missing values per column
print(df.isnull().sum())

# Label-like columns get an explicit 'Unknown' placeholder so that the
# dropna() below does not discard rows only because one of them is missing.
# NOTE(review): 'seasons' becomes a mixed-type column after this fill;
# presumably it is missing for every movie - confirm before using it numerically.
df[['age_certification','imdb_id','title', 'seasons']] = df[['age_certification','imdb_id','title', 'seasons']].fillna('Unknown')

# Score and vote gaps are filled with the most frequent value (mode)
df['imdb_score'] = df['imdb_score'].fillna(df['imdb_score'].mode()[0])
df['imdb_votes'] = df['imdb_votes'].fillna(df['imdb_votes'].mode()[0])

# Any row still holding a missing value (including a None primary genre or
# country) is removed
df = df.dropna(axis=0)

# Share of movies vs. TV shows
plt.figure(figsize=(7, 7))
type_counts = df['type'].value_counts()
type_counts.plot(kind='pie', autopct='%1.2f%%')
plt.ylabel('')
plt.title('Movies and TV Shows in the dataset')

# Five most frequent primary production countries, ignoring any 'Unknown' label
plt.figure(figsize=(10, 5))
known_country = df['primary_country'] != 'Unknown'
top_countries = df.loc[known_country, 'primary_country'].value_counts().nlargest(5)
top_countries.plot(kind='barh')
plt.title(' Top 5 countries with the highest number of shows')

# IMDb score histogram with a KDE overlay
plt.figure(figsize=(8, 6))
imdb_scores = df['imdb_score']
sns.histplot(imdb_scores, bins=20, kde=True)
plt.xlabel('IMDb Score')
plt.title('Distribution of IMDb Scores')
plt.show()

# Runtime spread for each title type
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='type', y='runtime')
plt.xlabel('Type')
plt.ylabel('Runtime (minutes)')
plt.title('Runtime Distribution by Type')
plt.show()

# Collapse the individual age certifications into broader audience groups
rating_map = {'TV-MA':'Adults',
              'R':'Adults',
              'PG-13':'Teens',
              'TV-14':'Young Adults',
              'TV-PG':'Older Kids',
              'NR':'Adults',
              'TV-G':'Kids',
              'TV-Y':'Kids',
              'TV-Y7':'Older Kids',
              'PG':'Older Kids',
              'G':'Kids',
              'NC-17':'Adults',
              'TV-Y7-FV':'Older Kids',
              'UR':'Adults'}

# Assign the result back to the column: calling replace(..., inplace=True)
# on df['age_certification'] acts on an intermediate Series, which pandas
# warns about and which leaves df untouched under Copy-on-Write.
df['age_certification'] = df['age_certification'].replace(rating_map)

# Show the resulting set of audience groups (values absent from rating_map
# are kept as they were)
print(df['age_certification'].unique())

# Age ratings for shows in the dataset
plt.figure(figsize=(10,5))
sns.countplot(x='age_certification',data=df)

# Number of shows on Netflix for different age groups
plt.figure(figsize=(10,5))
df['age_certification'].value_counts().plot(kind='barh')
plt.title('Number of shows on Netflix for different age groups')
# Bucket each title into its release decade (e.g. 1997 -> 1990)
df['release_decade'] = df['release_year'].floordiv(10).mul(10)

# Mean IMDb score within each decade
decade_groups = df.groupby('release_decade')
avg_scores_per_decade = decade_groups['imdb_score'].mean()
print(avg_scores_per_decade)

# How many titles fall into each release decade
plt.figure(figsize=(10, 5))
sns.histplot(df['release_decade'])
plt.title('distribution by released decade')
# Separate the data for movies and TV shows
movies = df[df['type'] == 'MOVIE']
tv_shows = df[df['type'] == 'SHOW']

# Primary-genre frequencies, computed once and reused below
movie_genre_counts = movies['primary_genre'].value_counts()
show_genre_counts = tv_shows['primary_genre'].value_counts()

# Five most common primary genres for movies
plt.figure(figsize=(10,5))
movie_genre_counts.nlargest(5).plot(kind='barh')
plt.title('Top 5 movie genres')

# Five most common primary genres for TV shows
plt.figure(figsize=(10,5))
show_genre_counts.nlargest(5).plot(kind='barh')
plt.title('Top 5 tv show genres')

# Box plot of imdb_score for the top 5 TV show genres
top_5_show_genres = show_genre_counts.head(5).index
plt.figure(figsize=(8, 6))
sns.boxplot(x='primary_genre', y='imdb_score', data=tv_shows, order=top_5_show_genres)
plt.xlabel('Show Genre')
plt.ylabel('imdb_score')
plt.title('IMDb Score Distribution by top 5 TV Show Genres')
plt.show()

# Box plot of imdb_score for the top 5 movie genres
top_5_movie_genres = movie_genre_counts.head(5).index
plt.figure(figsize=(8, 6))
sns.boxplot(x='primary_genre', y='imdb_score', data=movies, order=top_5_movie_genres)
plt.xlabel('Movie Genre')
plt.ylabel('imdb_score')
plt.title('IMDb Score Distribution by top 5 Movie Genres')
plt.show()

# Percentage of titles covered by the five most common primary genres.
# The values are printed because a bare expression in the middle of a cell
# is evaluated and then discarded.
top_5_movie_share = movie_genre_counts.nlargest(5).sum() / len(movies) * 100
top_5_show_share = show_genre_counts.nlargest(5).sum() / len(tv_shows) * 100
print(f'Share of top 5 movie genres: {top_5_movie_share:.2f}%')
print(f'Share of top 5 show genres: {top_5_show_share:.2f}%')
# One IMDb score histogram per title type (movies first, then TV shows)
for subset, colour, legend_name in ((movies, 'blue', 'Movies'),
                                    (tv_shows, 'red', 'TV Shows')):
    plt.figure(figsize=(8, 6))
    sns.histplot(subset['imdb_score'], bins=20, kde=True, color=colour, label=legend_name)
    plt.xlabel('IMDb Score')
    plt.title(f'Distribution of IMDb Scores for {legend_name}')
    plt.legend()
    plt.show()

# Overlaid histograms comparing movies and TV shows, first for IMDb scores
# and then for IMDb votes
for column, axis_label, title_noun in (('imdb_score', 'IMDb Score', 'IMDb Scores'),
                                       ('imdb_votes', 'IMDb Votes', 'IMDb Votes')):
    plt.figure(figsize=(8, 6))
    sns.histplot(movies[column], bins=20, kde=True, color='blue', label='Movies')
    sns.histplot(tv_shows[column], bins=20, kde=True, color='red', label='TV Shows')
    plt.xlabel(axis_label)
    plt.title(f'Distribution of {title_noun} for Movies and TV Shows')
    plt.legend()
    plt.show()

# Scatter plots of IMDb score against IMDb votes, one figure per title type
for subset, legend_name in ((movies, 'Movies'), (tv_shows, 'TV Shows')):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=subset, x='imdb_score', y='imdb_votes')
    plt.title(f'IMDb Score vs. IMDb Votes for {legend_name}')
    plt.xlabel('IMDb Score')
    plt.ylabel('IMDb Votes')
    plt.show()
# Numeric columns handed to the clustering step
features = df.loc[:, ['release_year', 'runtime', 'imdb_score', 'imdb_votes']]



In [None]:
# Analysis 1 - K-means clustering

# Standardise the selected features so that each contributes on a comparable scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Elbow method: record the inertia of a K-means fit for every K from 1 to 10
elbow_k_values = range(1, 11)
inertia = []
for k in elbow_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Inertia against K; the bend in the curve suggests a suitable K
plt.figure(figsize=(8, 6))
plt.plot(elbow_k_values, inertia, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.show()

# Silhouette score of a K-means fit for every K from 2 to 9
range_n_clusters = range(2, 10)
silhouette_avg = []
for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=33).fit(scaled_features)
    cluster_labels = kmeans.labels_
    score_for_k = silhouette_score(scaled_features, cluster_labels)
    silhouette_avg.append(score_for_k)

# Silhouette score against K
plt.figure(figsize=(10, 5))
plt.plot(range_n_clusters, silhouette_avg)
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k - KMeans clustering')
plt.show()
def _annotate_bar_counts(ax):
    """Write the height of every bar patch on ``ax`` just above the bar."""
    for bar in ax.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(format(bar_top, '.0f'), (bar_centre, bar_top),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')


# Final K-means fit with the chosen number of clusters
optimal_k = 4  # Example value based on the analysis
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Store the cluster label of every row on the DataFrame
df['cluster'] = clusters

# Clusters projected onto the IMDb score / IMDb votes plane
plt.figure(figsize=(8, 6))
plt.scatter(df['imdb_score'], df['imdb_votes'], c=clusters, cmap='viridis', alpha=0.5)
plt.xlabel('IMDb Score')
plt.ylabel('IMDb Votes')
plt.title('Clusters of Movies/TV Shows based on IMDb Scores and Votes')
plt.colorbar(label='Cluster')
plt.show()

# Pairwise scatter plots of the clustering features, coloured by cluster
sns.pairplot(df, hue='cluster', vars=list(features), palette='viridis')
plt.show()

# Number of movies and TV shows in each cluster
plt.figure(figsize=(10, 5))
q = sns.countplot(data=df, x='cluster', hue='type')
plt.title('Number of movies and TV shows in each cluster - Kmeans Clustering')
_annotate_bar_counts(q)

# Number of titles per release decade in each cluster
plt.figure(figsize=(10, 5))
q = sns.countplot(data=df, x='cluster', hue='release_decade')
plt.title('Release Decade in each cluster - Kmeans Clustering')
_annotate_bar_counts(q)

# Evaluation metrics of the final model: distortion (inertia) and silhouette score
kmeans_distortion = kmeans.inertia_
kmeans_silhouette_score = silhouette_score(scaled_features, kmeans.labels_)

print((kmeans_distortion,kmeans_silhouette_score))

In [None]:
# Analysis 2 - KNN MODEL
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import ast

In [None]:
# Build a dedicated frame for the KNN analysis straight from the raw file.
# Reusing the EDA frame `df` fails here: its 'genres' column already holds
# lists (ast.literal_eval raises on them) and its 'age_certification' values
# were regrouped, so the 'age_certification_PG-13' dummy column used below
# would not exist. A separate frame also keeps this cell re-runnable.
knn_df = pd.read_csv('raw_titles.csv')

# Convert string representations to lists
knn_df['genres'] = knn_df['genres'].apply(ast.literal_eval)
knn_df['production_countries'] = knn_df['production_countries'].apply(ast.literal_eval)

# Keep only the first genre / production country of each title
# (None when the list is empty)
knn_df['first_genre'] = knn_df['genres'].apply(lambda x: x[0] if len(x) > 0 else None)
knn_df['production_countries'] = knn_df['production_countries'].apply(lambda x: x[0] if len(x) > 0 else None)

print(knn_df[['title', 'first_genre', 'production_countries']])

# Handling missing values: label-like columns get an 'Unknown' placeholder,
# 'imdb_score' and 'imdb_votes' are imputed with their mode (most frequent
# value), and rows that still contain a missing value are dropped.
knn_df[['age_certification','imdb_id','title', 'seasons']] = knn_df[['age_certification','imdb_id','title', 'seasons']].fillna('Unknown')
knn_df['imdb_score'] = knn_df['imdb_score'].fillna(knn_df['imdb_score'].mode()[0])
knn_df['imdb_votes'] = knn_df['imdb_votes'].fillna(knn_df['imdb_votes'].mode()[0])
knn_df = knn_df.dropna(axis=0)

# Define features (X) and target variable (y). The column list has its own
# name so that it does not overwrite the `features` frame used for clustering.
knn_feature_columns = ['release_year', 'age_certification', 'first_genre', 'production_countries']
X = pd.get_dummies(knn_df[knn_feature_columns])
y = knn_df['imdb_score']

# 60/40 split of the full frame (not referenced again in the code visible here)
trainData, validData = train_test_split(knn_df, test_size=0.4, random_state=2)

# Split the data into training (70%), validation (15%) and testing (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=2)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=2)


# Create and fit the KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Generate new data point for prediction
new_data = {
    'release_year': [1997],
    'age_certification': ['PG-13'],
    'first_genre': ['action'],
    'production_countries': ['US']
}
new_data_df = pd.DataFrame(new_data)

# One-hot encode the new point and align it to the training columns;
# dummy columns it does not have are filled with 0
new_data_df = pd.get_dummies(new_data_df)
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Predict the IMDb score for the new data point
predicted_score = knn_model.predict(new_data_df)

# Print the predicted IMDb score
print("Predicted IMDb Score:", predicted_score[0])

# Positions of the nearest neighbours. The model was fitted on X_train, so
# these are row positions within X_train / y_train, not within X / y.
neighbors_indices = knn_model.kneighbors(new_data_df, return_distance=False)[0]
neighbor_features = X_train.iloc[neighbors_indices]
neighbor_scores = y_train.iloc[neighbors_indices]

# Dummy column shown on the second axis; it exists only when the raw data
# contains PG-13 titles
rating_column = 'age_certification_PG-13'

# 3D scatter plot to visualize the new point and its neighbours
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

# Plot existing data points
ax.scatter(X['release_year'], X[rating_column], y, c='blue', marker='o', label='Existing Data')

# Plot the new data point
ax.scatter(new_data_df['release_year'], new_data_df[rating_column], predicted_score, c='red', marker='x', s=100, label='New Data Point')

# Plot neighbors
ax.scatter(neighbor_features['release_year'], neighbor_features[rating_column], neighbor_scores,
           c='green', marker='^', s=50, label='Neighbors')

ax.set_xlabel('Release Year')
ax.set_ylabel('Age Certification (PG-13)')
ax.set_zlabel('IMDb Score')

plt.legend()
plt.show()

# Score the currently fitted KNN model on the held-out test split
y_pred = knn_model.predict(X_test)

# Mean squared error between the true and predicted IMDb scores
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

# Fit a KNN regressor for each candidate k and evaluate it on the validation
# data. k cannot exceed the number of training samples, so the range is capped.
k_values = list(range(1, min(100, len(X_train)) + 1))
accuracy_results = []

for k in k_values:
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train, y_train)

    # Predict IMDb scores for the validation set
    y_pred_valid = knn_model.predict(X_valid)

    # Validation mean squared error
    mse_valid = mean_squared_error(y_valid, y_pred_valid)

    # For a regressor, score() returns the coefficient of determination
    # (R^2), not a classification accuracy. The variable and key names are
    # kept as they were so existing references still work.
    accuracy_valid = knn_model.score(X_valid, y_valid)

    accuracy_results.append({
        'k': k,
        'mse_valid': mse_valid,
        'accuracy_valid': accuracy_valid
    })

# Plotting validation R^2 for different k values
plt.figure(figsize=(10, 6))
plt.plot([result['k'] for result in accuracy_results], [result['accuracy_valid'] for result in accuracy_results], marker='o')
plt.title('Validation R-squared for Different k Values')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('R-squared')
plt.grid(True)
plt.show()

# Find the best k based on the highest validation R^2
best_k_result = max(accuracy_results, key=lambda x: x['accuracy_valid'])
best_k = best_k_result['k']
best_accuracy = best_k_result['accuracy_valid']
print(f"Best k: {best_k}, Best validation R-squared: {best_accuracy}")

# Refit the KNN regressor with the best k found on the validation data
knn_model = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)

# Predictions for the held-out test split
y_pred = knn_model.predict(X_test)

# Coefficient of determination on the test split
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared: {r2}')

In [None]:
# Analysis 3 - REGRESSION TREES
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
! pip install dmba
from dmba import classificationSummary

In [None]:
# Drop unnecessary columns
def drop_columns(csv_file, columns_to_drop):
    """Read a CSV file and return its contents without the given columns.

    Parameters
    ----------
    csv_file : path or buffer accepted by ``pd.read_csv``
        The CSV file to load.
    columns_to_drop : list of str
        Column labels to remove. Labels that are not present in the file
        are skipped silently.

    Returns
    -------
    pandas.DataFrame
        The loaded data minus the listed columns.
    """
    return pd.read_csv(csv_file).drop(columns=columns_to_drop, errors='ignore')

# Raw file and the columns left out of the regression-tree analysis
csv_file_path = 'raw_titles.csv'
columns_to_drop = ['index', 'id', 'title', 'age_certification', 'genres', 'production_countries', 'seasons', 'imdb_id', 'imdb_score', 'imdb_votes']

modified_netflix_df = drop_columns(csv_file_path, columns_to_drop)

# Split by title type; the type text is stripped of surrounding whitespace
# before it is compared
title_type = modified_netflix_df['type'].str.strip()
shows_df = modified_netflix_df[title_type == 'SHOW'].copy()
movies_df = modified_netflix_df[title_type == 'MOVIE'].copy()

# Print each frame under its heading
for heading, frame in (("Modified DataFrame:", modified_netflix_df),
                       ("\nSHOWS DataFrame:", shows_df),
                       ("\nMOVIES DataFrame:", movies_df)):
    print(heading)
    print(frame)

# Reload the reduced frame and keep only the TV shows
modified_netflix_df = drop_columns(csv_file_path, columns_to_drop)
is_show = modified_netflix_df['type'].str.strip().eq('SHOW')
shows_df = modified_netflix_df.loc[is_show].copy()

# Feature: release year; target: runtime
X = shows_df[['release_year']]
y = shows_df['runtime']

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Depth-2 regression tree fitted on the training split
reg_tree = DecisionTreeRegressor(max_depth=2).fit(X_train, y_train)

# Draw the fitted tree
plt.figure(figsize=(15, 10))
plot_tree(
    reg_tree,
    feature_names=['Shows Release Year'],
    filled=True,
    rounded=True,
    impurity=False,
    fontsize=8,
    precision=0,
    label='all',
)

# Write the current figure to disk
plt.savefig('decision_tree_shows_release_year.png')

# Keep only the movies
is_movie = modified_netflix_df['type'].str.strip().eq('MOVIE')
movies_df = modified_netflix_df.loc[is_movie].copy()

# Feature: release year; target: runtime
X_movies = movies_df[['release_year']]
y_movies = movies_df['runtime']

# 80/20 train/test split
X_train_movies, X_test_movies, y_train_movies, y_test_movies = train_test_split(
    X_movies, y_movies, test_size=0.2, random_state=42
)

# Depth-2 regression tree with cost-complexity pruning
reg_tree_movies = DecisionTreeRegressor(max_depth=2, ccp_alpha=0.001).fit(X_train_movies, y_train_movies)

# Draw the fitted tree
plt.figure(figsize=(15, 10))
plot_tree(
    reg_tree_movies,
    feature_names=['Movies Release Year'],
    filled=True,
    rounded=True,
    impurity=False,
    fontsize=8,
    precision=0,
    label='all',
)

# Write the current figure to disk
plt.savefig('decision_tree_movies_release_year.png')

# Keep only the movies
is_movie = modified_netflix_df['type'].str.strip().eq('MOVIE')
movies_df = modified_netflix_df.loc[is_movie].copy()

# NOTE(review): the feature and the target are both 'runtime', so the tree
# only splits runtime against itself - confirm whether another feature was
# intended here.
X_movies = movies_df[['runtime']]
y_movies = movies_df['runtime']

# 80/20 train/test split
X_train_movies, X_test_movies, y_train_movies, y_test_movies = train_test_split(
    X_movies, y_movies, test_size=0.2, random_state=10
)

# Depth-2 regression tree; adjust the ccp_alpha value as needed
reg_tree_movies = DecisionTreeRegressor(max_depth=2, ccp_alpha=0.001).fit(X_train_movies, y_train_movies)

# Draw the fitted tree
plt.figure(figsize=(15, 10))
plot_tree(
    reg_tree_movies,
    feature_names=['Movies Runtime'],
    filled=True,
    rounded=True,
    impurity=False,
    fontsize=8,
    precision=0,
    label='all',
)

# Write the current figure to disk
plt.savefig('decision_tree_movies_runtime.png')

# Keep only the TV shows
is_show = modified_netflix_df['type'].str.strip().eq('SHOW')
shows_df = modified_netflix_df.loc[is_show].copy()

# NOTE(review): the feature and the target are both 'runtime', so the tree
# only splits runtime against itself - confirm whether another feature was
# intended here.
X_shows = shows_df[['runtime']]
y_shows = shows_df['runtime']

# 80/20 train/test split
X_train_shows, X_test_shows, y_train_shows, y_test_shows = train_test_split(
    X_shows, y_shows, test_size=0.2, random_state=10
)

# Depth-2 regression tree; adjust the ccp_alpha value as needed
reg_tree_shows = DecisionTreeRegressor(max_depth=2, ccp_alpha=0.001).fit(X_train_shows, y_train_shows)

# Draw the fitted tree
plt.figure(figsize=(15, 10))
plot_tree(
    reg_tree_shows,
    feature_names=['Shows Runtime'],
    filled=True,
    rounded=True,
    impurity=False,
    fontsize=8,
    precision=0,
    label='all',
)

# Write the current figure to disk
plt.savefig('decision_tree_shows_runtime.png')

In [None]:
# Analysis 4 - ASSOCIATION RULES
# import all the required packages
from pathlib import Path
import pandas as pd
import ast
# ! pip install mlxtend --user
import mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from IPython.display import display

# Load a fresh copy of the raw titles for the association-rule analysis
df_new = pd.read_csv('raw_titles.csv')

# Parse each 'genres' value from its string form into a Python list;
# the positional extraction below relies on real lists
genres_list = [ast.literal_eval(genre) for genre in df_new["genres"]]
df_new["genre_list"] = genres_list

# First genre of each title (None when the genre list is empty)
df_new["first_genre"] = df_new["genre_list"].apply(lambda x: x[0] if len(x) > 0 else None)

# Second genre, taken as a slice: a one-element list, an empty list when the
# title has a single genre, or None when it has no genres at all
df_new["second_genre"] = df_new["genre_list"].apply(lambda x: x[1:2] if len(x) > 0 else None)

# Flatten the slice to a string; an empty slice becomes ""
df_new["second_genre_string"] = df_new["second_genre"].apply(
    lambda x: ", ".join([str(item) for item in x]) if x is not None else None)

# Third genre, extracted the same way as the second
df_new["third_genre"] = df_new["genre_list"].apply(lambda x: x[2:3] if len(x) > 0 else None)

# Flatten the slice to a string; an empty slice becomes ""
df_new["third_genre_string"] = df_new["third_genre"].apply(
    lambda x: ", ".join([str(item) for item in x]) if x is not None else None)

# A numeric helper column is derived from the ID so the pivot tables have a
# column to count. The pattern is a raw string: "\d" in a normal string
# literal is an invalid escape sequence that newer Python versions warn about.
df_new["id_numeric"] = df_new["id"].str.extract(r"(\d+)")

# Convert the extracted digits to numbers (unparseable values become NaN)
df_new["id_numeric"] = pd.to_numeric(df_new["id_numeric"], errors="coerce")

# Pivot table on the first genre: one row per title id, one column per
# genre, each cell counting the matching rows (0 when there is none)
pivot_table = pd.pivot_table(
    df_new,
    values="id_numeric",
    index="id",
    columns="first_genre",
    aggfunc="count",
    fill_value=0)

# Pivot table on the second genre. Titles with a single genre fall into a
# column named "", which is removed. errors="ignore" keeps the cell running
# when no such column exists (every title having at least two genres).
pivot_table2 = pd.pivot_table(
    df_new,
    values="id_numeric",
    index="id",
    columns="second_genre_string",
    aggfunc="count",
    fill_value=0,
)
pivot_table2 = pivot_table2.drop(columns=[""], errors="ignore")

# Pivot table on the third genre, handled the same way: the "" column
# collects titles with fewer than three genres and is removed
pivot_table3 = pd.pivot_table(
    df_new,
    values="id_numeric",
    index="id",
    columns="third_genre_string",
    aggfunc="count",
    fill_value=0,
)
pivot_table3 = pivot_table3.drop(columns=[""], errors="ignore")

# Stack the three tables and add them up per title id, giving one row per
# title with a count for each genre found in its first three positions
summed_df = pd.concat([pivot_table, pivot_table2, pivot_table3]).groupby('id').sum()

# apriori works on a one-hot table. summed_df holds numeric counts, so it is
# turned into booleans first ("genre present for this title"); this also
# copes with any count above 1.
genre_presence = summed_df > 0

# Frequent itemsets with a minimum support of 1%, so that enough
# association rules are generated
itemsets = apriori(genre_presence, min_support=0.01, use_colnames=True)

# Rules built from the itemsets, filtered on lift >= 1. A lift above 1 means
# the antecedent and consequent occur together more often than they would if
# they were independent.
rules = association_rules(itemsets, metric='lift', min_threshold=1)

# Sorting the values by lift, in descending order
final_output = rules.sort_values(by=['lift'], ascending=False)

# Displaying all the data records in the output
with pd.option_context('display.max_rows', None):
    print(final_output)