Skip to content

Commit

Permalink
Add notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
yotkadata committed Jun 5, 2023
1 parent 699ce29 commit c148032
Show file tree
Hide file tree
Showing 5 changed files with 6,865 additions and 0 deletions.
104 changes: 104 additions & 0 deletions notebooks/clean_imdb_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the input CSV, using its first column as the DataFrame index\n",
"imdb_data = pd.read_csv(\n",
"    \"../data/data_with_imdb.csv\",\n",
"    index_col=[0],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove rows without imdb_id.\n",
"# .copy() yields an independent frame, so the column assignment in the genre\n",
"# cell below does not write into a slice of the loaded table\n",
"# (avoids pandas' SettingWithCopyWarning).\n",
"imdb_data = imdb_data[imdb_data[\"imdb_id\"].notna()].copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def format_genre(raw):\n",
"    \"\"\"Reformat one raw genre value into a ' | '-separated string.\n",
"\n",
"    Drops the first and last character and every single quote, then joins\n",
"    the comma-separated parts with ' | '. Presumably the raw value is a\n",
"    stringified list such as ['Action', 'Comedy'], which becomes\n",
"    Action | Comedy -- TODO confirm against the input file.\n",
"    \"\"\"\n",
"    parts = str(raw)[1:-1].replace(\"'\", \"\").split(\",\")\n",
"    return \"|\".join(parts).replace(\"| \", \" | \")\n",
"\n",
"\n",
"# Change format for genre.\n",
"# na_action=\"ignore\" leaves missing genres missing; without it, str(NaN)[1:-1]\n",
"# would turn every missing value into the bogus genre \"a\".\n",
"imdb_data[\"genre\"] = imdb_data[\"genre\"].map(format_genre, na_action=\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let pandas pick the best pd.NA-aware dtype for every column\n",
"# (e.g. nullable integers instead of floats for int columns with gaps)\n",
"imdb_data = imdb_data.convert_dtypes()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Order rows by movie_id, renumber the index from 0 and write the table to disk\n",
"# (the fresh index is written as the first CSV column)\n",
"movies_sorted = imdb_data.sort_values(\"movie_id\").reset_index(drop=True)\n",
"movies_sorted.to_csv(\"../data/movies_imdb.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Spot check: split the director field of the movie with movie_id 1000 on \"|\".\n",
"# The filtered frame keeps its original index labels, so the first row has to\n",
"# be fetched by position (.iloc[0]); a label lookup with [0] raises KeyError\n",
"# unless that row happens to carry the label 0.\n",
"movie = imdb_data[imdb_data[\"movie_id\"] == 1000].copy()\n",
"movie[\"director\"].iloc[0].split(\"|\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview the cleaned table (first rows only instead of the whole frame)\n",
"imdb_data.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "recommender",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
164 changes: 164 additions & 0 deletions notebooks/data_preparation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data preparation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from scipy.sparse import csr_matrix\n",
"from sklearn.decomposition import NMF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the movies table and the ratings table from disk\n",
"movies, ratings = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        \"../data/ml-latest-small/movies.csv\",\n",
"        \"../data/ml-latest-small/ratings.csv\",\n",
"    )\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exclude movies with 20 or fewer ratings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count the ratings each movie received and keep only the rows of movies\n",
"# that were rated more than MIN_RATINGS times\n",
"MIN_RATINGS = 20\n",
"\n",
"num_ratings_per_movie = ratings.groupby(\"movieId\").agg(rating=(\"rating\", \"count\"))\n",
"is_popular = num_ratings_per_movie[\"rating\"].gt(MIN_RATINGS)\n",
"popular_movie_ids = num_ratings_per_movie.loc[is_popular].index\n",
"\n",
"df = ratings.loc[ratings[\"movieId\"].isin(popular_movie_ids)].copy()\n",
"df"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Remake user and movie ids since they are not sequential"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def sequential_id_map(original_ids):\n",
"    \"\"\"Return a dict mapping each id to its position (0, 1, 2, ...) in `original_ids`.\"\"\"\n",
"    return {original: position for position, original in enumerate(original_ids)}\n",
"\n",
"\n",
"# Users: new ids follow the order in which the old ids first appear in df\n",
"user_ids = df[\"userId\"].unique()\n",
"user_id_map = sequential_id_map(user_ids)\n",
"df[\"user_id\"] = df[\"userId\"].map(user_id_map)\n",
"\n",
"# Movies: same treatment\n",
"movie_ids = df[\"movieId\"].unique()\n",
"movie_id_map = sequential_id_map(movie_ids)\n",
"df[\"movie_id\"] = df[\"movieId\"].map(movie_id_map)\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply the same id remapping to the movies table, keeping only the movies\n",
"# whose original id occurs in movie_ids\n",
"is_kept = movies[\"movieId\"].isin(movie_ids)\n",
"df_movies = movies.loc[is_kept].assign(\n",
"    movie_id=lambda frame: frame[\"movieId\"].map(movie_id_map)\n",
")\n",
"df_movies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Average rating per movie.\n",
"# Kept under its own name so the raw `ratings` table loaded at the top is not\n",
"# overwritten; re-running the filtering cell would otherwise fail because the\n",
"# aggregated frame has no \"movieId\" column.\n",
"avg_ratings = df.groupby(\"movie_id\")[[\"rating\"]].mean().reset_index()\n",
"avg_ratings.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add average rating column.\n",
"# Dropping a \"rating\" column left over from an earlier run keeps this cell\n",
"# re-runnable; a second plain merge would produce rating_x / rating_y instead.\n",
"df_movies = df_movies.drop(columns=\"rating\", errors=\"ignore\").merge(\n",
"    avg_ratings, how=\"left\", on=\"movie_id\"\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save prepared data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write the remapped ratings, ordered by user and then movie, without the index column\n",
"ratings_out = df.loc[:, [\"user_id\", \"movie_id\", \"rating\", \"timestamp\"]]\n",
"ratings_out = ratings_out.sort_values(by=[\"user_id\", \"movie_id\"])\n",
"ratings_out.to_csv(\"../data/ratings_prepared.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write the remapped movies with their average rating, ordered by movie_id,\n",
"# without the index column\n",
"movies_out = df_movies.loc[:, [\"movie_id\", \"title\", \"genres\", \"rating\"]]\n",
"movies_out = movies_out.sort_values(by=\"movie_id\")\n",
"movies_out.to_csv(\"../data/movies_prepared.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit c148032

Please sign in to comment.