Add notebooks

yotkadata · Jun 5, 2023 · c148032 · c148032
1 parent 699ce29
commit c148032
Show file tree

Hide file tree

Showing 5 changed files with 6,865 additions and 0 deletions.
diff --git a/notebooks/clean_imdb_data.ipynb b/notebooks/clean_imdb_data.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb_data = pd.read_csv(\"../data/data_with_imdb.csv\", index_col=[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove rows without imdb_id\n",
+    "imdb_data = imdb_data[~imdb_data[\"imdb_id\"].isna()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change format for genre\n",
+    "imdb_data[\"genre\"] = imdb_data[\"genre\"].apply(\n",
+    "    lambda x: \"|\".join(str(x)[1:-1].replace(\"'\", \"\").split(\",\")).replace(\"| \", \" | \")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert dtypes (int)\n",
+    "imdb_data = imdb_data.convert_dtypes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb_data.sort_values(\"movie_id\").reset_index(drop=True).to_csv(\"../data/movies_imdb.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movie = imdb_data[imdb_data[\"movie_id\"] == 1000].copy()\n",
+    "movie[\"director\"][0].split(\"|\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imdb_data"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "recommender",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/data_preparation.ipynb b/notebooks/data_preparation.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from scipy.sparse import csr_matrix\n",
+    "from sklearn.decomposition import NMF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies = pd.read_csv(\"../data/ml-latest-small/movies.csv\")\n",
+    "ratings = pd.read_csv(\"../data/ml-latest-small/ratings.csv\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exclude movies with less than 20 ratings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_ratings_per_movie = ratings.groupby(\"movieId\")[[\"rating\"]].count()\n",
+    "popular_movie_ids = num_ratings_per_movie[num_ratings_per_movie[\"rating\"] > 20].index\n",
+    "\n",
+    "df = ratings[ratings[\"movieId\"].isin(popular_movie_ids)].copy()\n",
+    "df"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Remake user and movie ids since they are not sequential"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user_ids = df[\"userId\"].unique()\n",
+    "user_id_map = {v: k for k, v in enumerate(user_ids)}\n",
+    "df[\"user_id\"] = df[\"userId\"].map(user_id_map)\n",
+    "\n",
+    "movie_ids = df[\"movieId\"].unique()\n",
+    "movie_id_map = {v: k for k, v in enumerate(movie_ids)}\n",
+    "df[\"movie_id\"] = df[\"movieId\"].map(movie_id_map)\n",
+    "\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change IDs in movies table, too\n",
+    "df_movies = movies[movies[\"movieId\"].isin(movie_ids)].copy()\n",
+    "df_movies[\"movie_id\"] = df_movies[\"movieId\"].map(movie_id_map)\n",
+    "df_movies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add average rating column\n",
+    "ratings = df.groupby(\"movie_id\")[[\"rating\"]].mean().reset_index()\n",
+    "ratings.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_movies = df_movies.merge(ratings, how=\"left\", left_on=\"movie_id\", right_on=\"movie_id\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save prepared data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[[\"user_id\", \"movie_id\", \"rating\", \"timestamp\"]].sort_values(\n",
+    "    [\"user_id\", \"movie_id\"]\n",
+    ").to_csv(\"../data/ratings_prepared.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_movies[[\"movie_id\", \"title\", \"genres\", \"rating\"]].sort_values(\"movie_id\").to_csv(\n",
+    "    \"../data/movies_prepared.csv\", index=False\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}