In [2]:
import psycopg2
from psycopg2 import OperationalError
from psycopg2.extensions import register_adapter, AsIs
import numpy as np
psycopg2.extensions.register_adapter(np.int64, psycopg2._psycopg.AsIs)
psycopg2.extensions.register_adapter(np.bool_, psycopg2._psycopg.AsIs)
import json
import pandas as pd
#import math

In [3]:
# Load the PostgreSQL connection settings (keys such as "db_name" and
# "db_user" are read from `data` when connecting) from a local JSON file.
with open("postgre_credentials.json") as credentials_file:
    data = json.loads(credentials_file.read())

In [4]:
def reset_connection():
    """Open a fresh PostgreSQL connection and cursor from the loaded credentials.

    If a connection is already bound to the notebook-level name
    ``connection`` it is closed first, so repeated resets do not pile up
    open sessions.

    Returns:
        tuple: ``(connection, cursor)`` for the newly opened connection.

    Raises:
        Whatever ``psycopg2.connect`` raises when the database cannot be
        reached with the settings stored in ``data``.
    """
    # The previous check, `'connection' not in locals()`, was always true
    # inside this function (the local name is unbound until assigned), so the
    # old connection was never closed. Look at the module namespace instead.
    previous = globals().get("connection")
    if previous is not None:
        try:
            previous.close()
        except psycopg2.Error:
            # Best effort only: a connection that is already broken should
            # not prevent us from opening a new one.
            pass

    new_connection = psycopg2.connect(
        database=data["db_name"],
        user=data["db_user"],
        password=data["db_password"],
        host=data["db_host"],
        port=data["db_port"],
    )
    return new_connection, new_connection.cursor()

In [5]:
# Bind the notebook-wide `connection` and `cur` used by every cell and helper below.
connection, cur = reset_connection()

In [5]:
#cur = connection.cursor()

In [6]:
# Start from a clean slate: drop `anime` and the `genre` table that references it.
# `genre` was previously never dropped, so re-running the notebook kept the old
# genre rows (CASCADE only removes the foreign-key constraint, not the table)
# and the insert loop then duplicated every genre.
query = "DROP TABLE IF EXISTS anime CASCADE"
cur.execute(query)
query = "DROP TABLE IF EXISTS genre CASCADE"
cur.execute(query)
connection.commit()

In [6]:
# Drop the google_trends table (if present) so it can be recreated below.
query="DROP TABLE IF EXISTS google_trends CASCADE"
cur.execute(query)
connection.commit()

In [8]:
# Drop the show_name table (if present); CASCADE also removes dependent objects
# such as foreign keys that point at it.
query="DROP TABLE IF EXISTS show_name CASCADE"
cur.execute(query)
connection.commit()

In [9]:
# Drop the date table (if present) so it can be recreated below.
query="DROP TABLE IF EXISTS date CASCADE"
cur.execute(query)
connection.commit()

In [10]:
# DDL for the main `anime` table, keyed by anime_id. The columns are filled by
# insert_anime_info() from the MAL CSV. This cell only builds the statement;
# the following cells execute and commit it.
query="""CREATE TABLE IF NOT EXISTS anime (
anime_id INTEGER PRIMARY KEY,
eng_name TEXT,
type TEXT,
source TEXT,
episode_count INTEGER,
maturity_rating TEXT,
MAL_score DECIMAL,
scored_count INTEGER,
MAL_rank INTEGER,
MAL_popularity INTEGER,
favorite_count INTEGER,
studio_name TEXT,
licensor TEXT,
air_status TEXT,
is_airing BOOLEAN,
year INTEGER,
season TEXT
)
"""

In [11]:
# Run the statement currently held in `query` (the anime CREATE TABLE when cells run in order).
cur.execute(query)


In [12]:
# Persist the table creation.
connection.commit()

In [13]:
# Create the `date` table: a serial key plus a year and a month per row
# (rows are added by insert_date()).
query="""CREATE TABLE IF NOT EXISTS date (
date_id SERIAL PRIMARY KEY,
year INTEGER,
month INTEGER
)
"""
cur.execute(query)
connection.commit()

In [14]:
# Create the `show_name` table: one alternate title per row, linked to its
# anime through a foreign key on anime_id (rows are added by insert_alt_name()).
query="""CREATE TABLE IF NOT EXISTS show_name (
show_name_id SERIAL PRIMARY KEY,
alternate_name TEXT,
anime_id INTEGER REFERENCES anime (anime_id)
)
"""
cur.execute(query)
connection.commit()

In [15]:
# Create the `genre` table: one genre name per row, linked to its anime
# through a foreign key on anime_id (rows are added by insert_genre()).
query="""CREATE TABLE IF NOT EXISTS genre (
genre_id SERIAL PRIMARY KEY,
genre_name TEXT,
anime_id INTEGER REFERENCES anime (anime_id)
)
"""
cur.execute(query)
connection.commit()

In [7]:
# Create the `google_trends` table. Its `show_name` and `date` columns are
# integer foreign keys into show_name(show_name_id) and date(date_id), so both
# of those tables must exist before this cell runs.
query="""CREATE TABLE IF NOT EXISTS google_trends (
google_trends_id SERIAL PRIMARY KEY,
show_name INTEGER REFERENCES show_name(show_name_id),
date INTEGER REFERENCES date(date_id),
popularity_score INT
)
"""
cur.execute(query)
connection.commit()

In [17]:
#Loop through MAL csv file
#insert into anime table first
#split alternative names and loop to insert each show name
#split through genres and insert each genre

In [8]:
def split_column(row, col):
    """Split a comma-separated cell of ``mal_df`` into its individual entries.

    Args:
        row: index label of the row to read from ``mal_df``.
        col: name of a column holding a comma-separated string.

    Returns:
        list[str]: the entries with surrounding whitespace removed. Empty
        entries (e.g. from a trailing comma) are dropped.
    """
    row_attributes = mal_df.loc[row, col]
    # Strip each piece instead of deleting every space in the cell: the old
    # `.replace(" ", "")` turned multi-word values such as "Slice of Life"
    # into "SliceofLife".
    pieces = (piece.strip() for piece in row_attributes.split(","))
    return [piece for piece in pieces if piece]

In [9]:
def insert_genre(split_row, anime_id):
    """Insert one ``genre`` row per entry of ``split_row`` for ``anime_id``.

    Uses the notebook-level ``cur`` / ``connection`` and commits after every
    single insert. Nothing happens for an empty ``split_row``.
    """
    if len(split_row) == 0:
        return
    for genre_name in split_row:
        cur.execute("""INSERT INTO genre(genre_name, anime_id)
            VALUES (%s,%s)""",
                    (genre_name, anime_id))
        connection.commit()

In [10]:
def insert_alt_name(split_row, anime_id):
    """Insert each usable title in ``split_row`` into ``show_name`` for ``anime_id``.

    Entries equal to the placeholder "NA" are skipped. Uses the notebook-level
    ``cur`` / ``connection`` and commits after every single insert.
    """
    if len(split_row) == 0:
        return
    for candidate in split_row:
        if candidate == "NA":
            # "NA" is the fill value used for missing titles, not a real name.
            continue
        cur.execute("""INSERT INTO show_name(alternate_name, anime_id)
                VALUES (%s,%s)""",
                    (candidate, anime_id))
        connection.commit()

In [11]:
def _has_usable_air_tokens(tokens):
    """Return True when ``tokens`` passes the notebook's validity check for a parsed air date."""
    return tokens[0] != "Not" and len(tokens) == 3 and "" not in tokens


def _insert_anime_row(row, year, season):
    """Insert ``row`` plus the resolved ``year`` / ``season`` into ``anime`` and commit."""
    cur.execute("""INSERT INTO anime(anime_id, eng_name, type, source,episode_count,
                        maturity_rating, MAL_score, scored_count, MAL_rank, MAL_popularity,
                        favorite_count, studio_name, licensor, air_status, is_airing, year, season)
                        VALUES (%s, %s, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (row["anime_id"], row["title_english"], row["type"], row["source"], row["episodes"],
                 row["rating"], row["score"], row["scored_by"], row["rank"], row["popularity"],
                 row["favorites"], row["studio"], row["licensor"], row["status"], row["airing"],
                 year, season))
    connection.commit()


def insert_anime_info(row):
    """Insert one MAL record into the ``anime`` table, deriving year and season.

    The year/season come from ``row["premiered"]`` when that value can be
    split as a string. When it cannot (it is missing or not a string), they
    are derived from the start of ``row["aired_string"]`` using the
    module-level ``seasons`` month map. Rows whose date tokens fail the
    validity check are skipped without inserting anything.

    Args:
        row: mapping-like record (e.g. a DataFrame row) with the MAL columns
            read below.

    Raises:
        Database errors from the INSERT/commit now propagate to the caller.
        Previously a bare ``except:`` swallowed them and attempted a second
        INSERT on the already-failed transaction.
    """
    # Only the parsing step is guarded; this is the one failure the fallback
    # is meant to handle.
    try:
        time = row["premiered"].split(" ")
    except (AttributeError, KeyError):
        time = None

    if time is not None:
        # NOTE(review): this requires exactly three tokens but reads the
        # season from token 0 and the year from token 1. A "Season Year"
        # value would never pass the check — confirm the real format of
        # the `premiered` column.
        if not _has_usable_air_tokens(time):
            return
        year = time[1]
        season = time[0]
    else:
        aired = row["aired_string"]
        if not isinstance(aired, str):
            # No usable date in either column: skip, like other undated rows.
            return
        # Keep only the start date of an "<start> to <end>" range.
        time = aired.split("to")[0].split(" ")
        if not _has_usable_air_tokens(time):
            return
        year = int(time[2])
        season = seasons[time[0]]

    _insert_anime_row(row, year, season)

        

    


In [12]:
def insert_date(month_year):
    """Insert one year/month pair into the ``date`` table.

    Args:
        month_year: two-item sequence read as ``(year, month)`` — despite the
            parameter name, index 0 is the year and index 1 is the month.

    A failed insert is reported by printing the month and year, and the
    transaction is rolled back so later inserts on the same connection are
    not rejected. The failure is not re-raised (best effort, as before).
    """
    # Unpack before the try block so the names are always bound when the
    # failure message is printed.
    year = month_year[0]
    month = month_year[1]
    try:
        cur.execute("""INSERT INTO date(year, month) 
        VALUES (%s, %s)""",
                    (year, month))
        connection.commit()
    except Exception:
        # Without a rollback the connection stays in a failed transaction.
        connection.rollback()
        print(month, year)

In [13]:
# Read the location of the MAL CSV from a local text file, then load the CSV.
with open("Anime_List_Path.txt") as file:
    # Strip surrounding whitespace: a trailing newline in the text file would
    # otherwise become part of the filename and make read_csv fail.
    path = file.read().strip()

mal_df = pd.read_csv(path)

In [14]:
# Replace missing values column by column: text columns get the placeholder
# "NA" (which insert_alt_name() later skips), the numeric rank gets 0.
for column, placeholder in {
    "producer": "NA",
    "licensor": "NA",
    "title_english": "NA",
    "rank": 0,
    "genre": "NA",
    "title_synonyms": "NA",
    "title_japanese": "NA",
}.items():
    mal_df.loc[:, column] = mal_df.loc[:, column].fillna(placeholder)

In [15]:
# Month abbreviation -> season name, used by insert_anime_info() when the
# season has to be derived from the first token of an air date.
# NOTE(review): this groups Dec-Feb as Winter, Mar-May as Spring, and so on.
# Confirm it matches the season convention used in the `premiered` column.
seasons = dict(
    zip(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        ("Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
         "Summer", "Summer", "Fall", "Fall", "Fall", "Winter"),
    )
)

## Insert MAL info (Except Alternate Names and Genre)

In [26]:
# Insert every MAL record into the anime table.
# Pass the row that iterrows() already yields: `mal_df.iloc[idx]` treated the
# index *label* as a *position*, which is only correct for a default 0..n-1 index.
for idx, row in mal_df.iterrows():
    insert_anime_info(row)

## Insert Genres

In [27]:
#mal_df.loc[:,"anime_id"]

In [28]:
# For every MAL record, split its comma-separated genre cell and insert one
# genre row per entry, keyed by that record's anime_id.
# NOTE(review): the genre table has a foreign key on anime_id, so records that
# insert_anime_info() skipped would presumably fail here — confirm.
for idx, row in mal_df.iterrows():
    split_genres = split_column(idx, "genre")
    insert_genre(split_genres, mal_df.loc[idx,"anime_id"])

## Insert Alternate Names

In [29]:
# mal_df.columns
# print(mal_df.loc[0,"title_synonyms"])
# print(mal_df.loc[0,"title_japanese"])

In [2]:
# Rebind `connection` and `cur` to a fresh connection before inserting alternate names.
# Requires the cell defining reset_connection() to have been run in this kernel.
connection, cur = reset_connection()

NameError: name 'reset_connection' is not defined

In [None]:
# Register every known title (English, synonyms, Japanese) of each record in
# the show_name table.
for idx, row in mal_df.iterrows():
    try:
        alt_names = split_column(idx, "title_synonyms")
        japanese_names = split_column(idx,"title_japanese")
        eng_name = [mal_df.loc[idx,"title_english"]]
        all_names = eng_name + alt_names + japanese_names
        insert_alt_name(all_names, mal_df.loc[idx,"anime_id"])
    except Exception:
        # Best effort: on any ordinary failure, get a fresh connection and
        # move on to the next record. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt through so the loop can be stopped.
        connection, cur = reset_connection()


## Insert Dates

In [32]:
# Adapted from https://stackoverflow.com/questions/5734438/how-to-create-a-month-iterator
def month_year_iter( start_month, start_year, end_month, end_year ):
    """Yield ``(year, month)`` tuples, one per calendar month.

    Iteration starts at ``start_month``/``start_year`` (inclusive) and stops
    just before ``end_month``/``end_year`` (exclusive). Months are 1-based.
    """
    # Number each month consecutively so the span becomes a plain integer range.
    first = start_year * 12 + (start_month - 1)
    stop = end_year * 12 + (end_month - 1)
    yield from ((index // 12, index % 12 + 1) for index in range(first, stop))

# Every month from January 2004 up to, but not including, January 2021.
date_generator = month_year_iter(1,2004,1,2021)


In [33]:
# Insert one date row per (year, month) produced by the generator.
# Iterating the generator directly replaces the hard-coded `range(204)` +
# `next()`, which had to match the generator's length exactly and raised
# StopIteration if this cell was run a second time.
for month_year in date_generator:
    insert_date(month_year)

In [29]:
# Release the database connection now that all inserts are done.
connection.close()