# Laboratory work N1
## Imports and settings

In [1]:
import numpy as np
import pandas as pd

from typing import Union, Any
import configparser

import psycopg2
from psycopg2.extras import DictCursor, RealDictCursor

from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

import json
import datetime
import time

In [2]:
# Show every column when a dataframe is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [3]:
def config(filename: str = 'database.ini', section: str = 'postgresql') -> dict[str, Any]:
    """Read one section of an .ini file and return it as a plain dict.

    Args:
        filename: Path of the .ini file to parse.
        section: Name of the section whose options are returned.

    Returns:
        Mapping of option name to its (string) value for ``section``.

    Raises:
        Exception: If the parsed file has no ``section``.
    """
    parser = configparser.ConfigParser()
    parser.read(filename)

    # Guard clause: without the section there is nothing to return.
    if not parser.has_section(section):
        raise Exception('Invalid .ini file')

    return {option: value for option, value in parser.items(section)}

## Utils

In [4]:
import re

def camel_to_snake(string: str) -> str:
    """Convert a CamelCase identifier to lower-case snake_case.

    Existing underscores are kept, so an input such as ``Foo_Bar`` yields a
    double underscore (``foo__bar``).
    """
    # Pass 1: put an underscore before every capitalised word
    # ("RottenTomatoes" -> "Rotten_Tomatoes").
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', string)
    # Pass 2: separate a lower-case letter or digit from a following capital
    # ("getHTTP" -> "get_HTTP").
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return fully_split.lower()

def remove_consecutive_underscores(string: str) -> str:
    """Collapse every run of underscores in ``string`` into a single underscore."""
    underscore_run = re.compile('_+')
    return underscore_run.sub('_', string)
        
def get_rename_mapping(columns: list[str]) -> dict[str, str]:
    """Map each column name to its snake_case form.

    Each name goes through ``camel_to_snake`` and then
    ``remove_consecutive_underscores``; the result is keyed by the original name.
    """
    return {
        column: remove_consecutive_underscores(camel_to_snake(column))
        for column in columns
    }

def extract_number(string: str) -> Union[str, None]:
    """Return the first run of digits found in ``string``.

    The argument is converted with ``str()`` first, so non-string values
    (numbers, ``None``, NaN) are accepted.

    Returns:
        The first digit sequence as a string, or ``None`` when there is none.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    match = re.search(r'\d+', str(string))
    return match.group(0) if match else None

In [5]:
def get_intersection(f_series: pd.Series, s_series: pd.Series) -> pd.Series:
    """Return the values present in both series, ignoring missing values.

    The result comes from ``np.intersect1d``, wrapped in a new Series.
    """
    first_values = f_series.dropna()
    second_values = s_series.dropna()
    common_values = np.intersect1d(first_values, second_values)
    return pd.Series(common_values)

In [6]:
def extract_date(data: pd.Series) -> pd.DataFrame:
    """Split a series of dates into ``day``, ``month`` and ``year`` columns.

    Values that cannot be parsed are coerced to NaT, so their parts are missing.
    """
    parsed = pd.to_datetime(data, errors='coerce')
    # Column order is part of the result: day, month, year.
    parts = {part: getattr(parsed.dt, part) for part in ('day', 'month', 'year')}
    return pd.DataFrame(parts)

In [7]:
def get_values_string(values_number: int) -> str:
    """Build a parenthesised, comma-separated list of ``values_number`` ``%s`` placeholders."""
    placeholders = ','.join(['%s'] * values_number)
    return '(' + placeholders + ')'

## Process IMDB movies dataset

In [8]:
# Load the IMDB movies dataset; the bare `movies` expression displays it as the cell output.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests the
# CSV's saved index is being read as data — confirm whether that column is wanted.
movies = pd.read_csv('datasets/imdb.csv')
movies

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85850,tt9908390,Le lion,Le lion,2020,2020-01-29,Comedy,95,"France, Belgium",French,Ludovic Colbeau-Justin,"Alexandre Coquelle, Matthieu Le Naour",Monkey Pack Films,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",A psychiatric hospital patient pretends to be ...,5.3,398,,,$ 3507171,,,4.0
85851,tt9911196,De Beentjes van Sint-Hildegard,De Beentjes van Sint-Hildegard,2020,2020-02-13,"Comedy, Drama",103,Netherlands,"German, Dutch",Johan Nijenhuis,"Radek Bajgar, Herman Finkers",Johan Nijenhuis & Co,"Herman Finkers, Johanna ter Steege, Leonie ter...",A middle-aged veterinary surgeon believes his ...,7.7,724,,,$ 7299062,,6.0,4.0
85852,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,"Vineesh Aaradya, Vineesh Aaradya",RMCC Productions,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,7.9,265,,,,,,
85853,tt9914286,Sokagin Çocuklari,Sokagin Çocuklari,2019,2019-03-15,"Drama, Family",98,Turkey,Turkish,Ahmet Faik Akinci,"Ahmet Faik Akinci, Kasim Uçkan",Gizem Ajans,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",,6.4,194,,,$ 2833,,,


## Process oscar ceremonies dataset

In [9]:
# Load the Oscar ceremonies dataset; the bare `oscars` expression displays it as the cell output.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests the
# CSV's saved index is being read as data — confirm whether that column is wanted.
oscars = pd.read_csv('datasets/oscars.csv')
oscars

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10390,2019,2020,92,WRITING (Original Screenplay),"Screenplay by Bong Joon Ho, Han Jin Won; Story...",Parasite,True
10391,2019,2020,92,JEAN HERSHOLT HUMANITARIAN AWARD,Geena Davis,,True
10392,2019,2020,92,HONORARY AWARD,David Lynch,,True
10393,2019,2020,92,HONORARY AWARD,Wes Studi,,True


## Process Fandango dataset

In [10]:
# Load the Fandango ranking dataset; the bare `ranking` expression displays it as the cell output.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests the
# CSV's saved index is being read as data — confirm whether that column is wanted.
ranking = pd.read_csv('datasets/ranking.csv')
ranking

Unnamed: 0,FILM,RottenTomatoes,RottenTomatoes_User,Metacritic,Metacritic_User,IMDB,Fandango_Stars,Fandango_Ratingvalue,RT_norm,RT_user_norm,Metacritic_norm,Metacritic_user_nom,IMDB_norm,RT_norm_round,RT_user_norm_round,Metacritic_norm_round,Metacritic_user_norm_round,IMDB_norm_round,Metacritic_user_vote_count,IMDB_user_vote_count,Fandango_votes,Fandango_Difference
0,Avengers: Age of Ultron (2015),74,86,66,7.1,7.8,5.0,4.5,3.70,4.30,3.30,3.55,3.90,3.5,4.5,3.5,3.5,4.0,1330,271107,14846,0.5
1,Cinderella (2015),85,80,67,7.5,7.1,5.0,4.5,4.25,4.00,3.35,3.75,3.55,4.5,4.0,3.5,4.0,3.5,249,65709,12640,0.5
2,Ant-Man (2015),80,90,64,8.1,7.8,5.0,4.5,4.00,4.50,3.20,4.05,3.90,4.0,4.5,3.0,4.0,4.0,627,103660,12055,0.5
3,Do You Believe? (2015),18,84,22,4.7,5.4,5.0,4.5,0.90,4.20,1.10,2.35,2.70,1.0,4.0,1.0,2.5,2.5,31,3136,1793,0.5
4,Hot Tub Time Machine 2 (2015),14,28,29,3.4,5.1,3.5,3.0,0.70,1.40,1.45,1.70,2.55,0.5,1.5,1.5,1.5,2.5,88,19560,1021,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Mr. Holmes (2015),87,78,67,7.9,7.4,4.0,4.0,4.35,3.90,3.35,3.95,3.70,4.5,4.0,3.5,4.0,3.5,33,7367,1348,0.0
142,'71 (2015),97,82,83,7.5,7.2,3.5,3.5,4.85,4.10,4.15,3.75,3.60,5.0,4.0,4.0,4.0,3.5,60,24116,192,0.0
143,"Two Days, One Night (2014)",97,78,89,8.8,7.4,3.5,3.5,4.85,3.90,4.45,4.40,3.70,5.0,4.0,4.5,4.5,3.5,123,24345,118,0.0
144,Gett: The Trial of Viviane Amsalem (2015),100,81,90,7.3,7.8,3.5,3.5,5.00,4.05,4.50,3.65,3.90,5.0,4.0,4.5,3.5,4.0,19,1955,59,0.0


## Postgres utils
### QuerySet wrapper

In [11]:
class QuerySet:
    """Thin wrapper around a cursor-like object exposing ``fetchall``/``fetchone``.

    Both accessors return ``None`` when the wrapped object is falsy.
    """

    def __init__(self, query_set):
        # The wrapped object; it is only used when truthy.
        self.query_set = query_set

    def all(self):
        """Return ``fetchall()`` of the wrapped object, or ``None`` if it is falsy."""
        if not self.query_set:
            return None
        return self.query_set.fetchall()

    def one(self):
        """Return ``fetchone()`` of the wrapped object, or ``None`` if it is falsy."""
        if not self.query_set:
            return None
        return self.query_set.fetchone()

### Postgres API

In [12]:
class PgAPI:
    """Static helpers for running SQL against the database described by ``config()``.

    One autocommit connection is opened lazily and reused by every helper,
    instead of opening (and never closing) a new connection per query.
    """

    # Shared connection; created on first use by get_db().
    _connection = None

    @staticmethod
    def get_db():
        """Return the shared autocommit connection, (re)connecting when needed."""
        conn = PgAPI._connection
        # psycopg2 reports a non-zero `closed` once a connection is closed or broken.
        if conn is None or conn.closed:
            conn = psycopg2.connect(**config())
            conn.autocommit = True
            PgAPI._connection = conn
        return conn

    @staticmethod
    def get_cursor(cursor_factory=None):
        """Open a cursor on the shared connection, optionally with ``cursor_factory``."""
        return PgAPI.get_db().cursor(cursor_factory=cursor_factory)

    @staticmethod
    def _run(query, args, cursor_factory=None) -> QuerySet:
        """Execute ``query`` with ``args`` and wrap the still-open cursor in a QuerySet."""
        cursor = PgAPI.get_cursor(cursor_factory)
        cursor.execute(query, args)
        return QuerySet(cursor)

    @staticmethod
    def execute_query(query: str, *args) -> QuerySet:
        """Run ``query`` with positional parameters ``args`` using the default cursor."""
        return PgAPI._run(query, args)

    @staticmethod
    def execute_dict_query(query: str, *args) -> QuerySet:
        """Run ``query`` with positional parameters ``args`` using a ``DictCursor``."""
        return PgAPI._run(query, args, DictCursor)

    @staticmethod
    def execute_rdict_query(query: str, *args) -> QuerySet:
        """Run ``query`` with positional parameters ``args`` using a ``RealDictCursor``."""
        return PgAPI._run(query, args, RealDictCursor)

    @staticmethod
    def execute_call(query: str, *args) -> None:
        """Run ``query`` with positional parameters ``args`` and discard any result."""
        # No result is handed back, so the cursor can be closed right away.
        with PgAPI.get_cursor() as cursor:
            cursor.execute(query, args)

### Database utils (init / drop / clear)

In [13]:
def init_db(file_name: str = 'schema.sql') -> None:
    """Read the SQL script at ``file_name`` and execute it through ``PgAPI.execute_call``."""
    with open(file_name, 'r') as fhand:
        schema_sql = fhand.read()
    PgAPI.execute_call(schema_sql)

def init_dates() -> None:
    """Invoke the ``fill_dates()`` database procedure."""
    PgAPI.execute_call('CALL fill_dates()')

def clear_db() -> None:
    """Truncate (with CASCADE) every table listed in ``pg_tables`` for the public schema."""
    # Local import: keeps this fix self-contained within the function.
    from psycopg2 import sql

    t_query = "SELECT tablename FROM pg_tables\
        WHERE schemaname='public'"
    query_set = PgAPI.execute_query(t_query).all()
    for record in query_set:
        # Quote the table name as an identifier so mixed-case or reserved
        # names do not break the statement.
        query = sql.SQL('TRUNCATE {} CASCADE').format(sql.Identifier(record[0]))
        PgAPI.execute_call(query)

### Create SQLAlchemy engine

In [14]:
# Build the SQLAlchemy engine from the same .ini settings used for psycopg2.
params = config()
params['username'] = params.pop('user')  # rename key
params['drivername'] = 'postgresql'
# URL.create() is the supported constructor from SQLAlchemy 1.4 onwards;
# fall back to calling URL directly on older releases that lack it.
make_url = getattr(URL, 'create', URL)
conn_url = make_url(**params)
engine = create_engine(conn_url)

### QueryBuilder

In [15]:
def select(t_name: str, columns: list[str]) -> str:
    """Build a ``SELECT <columns> FROM <t_name>`` statement.

    Column names are joined with ``', '``; nothing is quoted or escaped.
    """
    column_list = ', '.join(columns)
    return 'SELECT ' + column_list + ' FROM ' + t_name

In [16]:
def insert(t_name: str, df: pd.DataFrame) -> None:
    """Append the rows of ``df`` to table ``t_name`` via the module-level ``engine``.

    The dataframe index is not written.
    """
    to_sql_options = {'con': engine, 'if_exists': 'append', 'index': False}
    df.to_sql(t_name, **to_sql_options)

In [17]:
def where(t_name: str, column_name: str, values: list[Any]) -> list[tuple]:
    """Fetch the id rows of ``t_name`` whose ``column_name`` equals any of ``values``.

    Args:
        t_name: Table name; the id column is derived as ``t_name`` minus its
            last character plus ``'_id'``.
        column_name: Column compared against ``values``.
        values: Candidate values; one placeholder is generated per value.

    Returns:
        The rows returned by the query (one row per match), or an empty list
        when ``values`` is empty.
    """
    values = list(values)
    if not values:
        # "IN ()" is not valid SQL, and nothing can match anyway.
        return []

    # assumes table names are plurals formed with a trailing 's' — TODO confirm
    t_id = t_name[:-1] + '_id'
    # One %s per value: the original single placeholder only worked for
    # exactly one value.
    placeholders = ', '.join(['%s'] * len(values))
    query = f'SELECT {t_id} FROM {t_name} WHERE {column_name} IN ({placeholders})'
    query_set = PgAPI.execute_query(query, *values)
    return query_set.all()

## Providers between pandas dataframes and database tables

### Insert unique data from dataset to the database table

In [18]:
def insert_unique(t_name: str, df: pd.DataFrame, fields: Union[str, list[str], dict[str, str]]) -> None:
    """Insert into ``t_name`` the distinct rows of ``df`` that the table lacks.

    Args:
        t_name: Target table.
        df: Source dataframe.
        fields: Columns that identify a row. A single name or a list of names
            is used for both the dataframe and the table; a dict maps
            dataframe column names to table column names.
    """
    # Normalise `fields` into parallel tuples of dataframe / table columns.
    if isinstance(fields, dict):
        df_cols, db_cols = zip(*fields.items())
    else:
        names = [fields] if isinstance(fields, str) else fields
        df_cols = db_cols = tuple(names)

    existing_rows = PgAPI.execute_dict_query(select(t_name, db_cols)).all()
    candidates = df[list(df_cols)].drop_duplicates()
    renames = dict(zip(df_cols, db_cols))

    if existing_rows:
        db_data = pd.DataFrame(existing_rows, columns=db_cols)
        # Left merge with an indicator: 'left_only' marks rows absent from the table.
        merged = candidates.merge(
            right=db_data, how='left',
            left_on=df_cols, right_on=db_cols,
            indicator=True
        )
        table_only_cols = set(db_cols).difference(df_cols)
        is_missing = merged['_merge'] == 'left_only'
        candidates = merged[is_missing].drop(columns=['_merge', *table_only_cols])

    insert(t_name, candidates.rename(columns=renames))

### Join dataframe and database table

In [19]:
def join(t_name: str, df: pd.DataFrame, fields: Union[str, list[str], dict[str, str]], out: str) -> pd.DataFrame:
    """Replace the key columns of ``df`` with the matching ``id`` from ``t_name``.

    Args:
        t_name: Lookup table; its ``id`` column and the key columns are selected.
        df: Source dataframe (not modified).
        fields: Key columns. A single name or a list of names is used for both
            the dataframe and the table; a dict maps dataframe column names to
            table column names.
        out: Name of the new column that receives the looked-up ids.

    Returns:
        A copy of ``df`` with a fresh 0..n-1 index, without the key columns
        and with ``out`` holding the id (missing where no table row matched).
    """
    if isinstance(fields, str):
        fields = [fields]

    if isinstance(fields, dict):
        df_cols, db_cols = zip(*fields.items())
    else:
        df_cols = db_cols = tuple(fields)

    id_and_keys = ('id', ) + db_cols
    data = PgAPI.execute_dict_query(select(t_name, id_and_keys)).all()
    db_data = pd.DataFrame(data, columns=id_and_keys)

    # merge() returns rows in the order of `df` but with a fresh 0..n-1 index.
    # Reset df's index BEFORE attaching the ids so the assignment lines up
    # row-for-row even when the caller's index has gaps (e.g. after dropna);
    # resetting afterwards, as before, misaligned the ids.
    df = df.reset_index(drop=True)
    join_ids = df[list(df_cols)].merge(
        right=db_data, how='left',
        left_on=df_cols, right_on=db_cols
    )['id']

    to_return = df.drop(list(df_cols), axis=1)
    to_return[out] = join_ids
    return to_return

## Fill database

In [20]:
# Run the schema script, then call the date-filling procedure.
# Uncomment clear_db() to truncate the public tables first.
#clear_db()
init_db()
init_dates()

## Fill types tables

### Fill fact types

In [21]:
# Insert one fact_types row per name, using one "(%s)" placeholder group per value.
fact_types = ('oscar', 'movie', 'ranking')
values_string = ','.join(['(%s)'] * len(fact_types))
PgAPI.execute_call('INSERT INTO fact_types (type_name) VALUES ' + values_string, *fact_types)

### Fill ranking types

In [22]:
# Insert one ranking_types row per name, using one "(%s)" placeholder group per value.
ranking_types = ('imdb', 'metacritic', 'rotten_tomatoes')
values_string = ','.join(['(%s)'] * len(ranking_types))
PgAPI.execute_call('INSERT INTO ranking_types (type_name) VALUES ' + values_string, *ranking_types)

## Mapping between dataframes and tables

### Base class

In [23]:
class Table:
    """Base wrapper that prepares a dataframe for loading into the database.

    Attributes:
        df: The working dataframe (a private copy of the selected columns).
        fact_type: Fact type name passed to ``Fact`` by ``load_fact``.
    """

    def __init__(self, df: pd.DataFrame, fields: list[str],
                 fact_type: Union[str, None] = None) -> None:
        """Keep the ``fields`` columns of ``df`` and remember ``fact_type``.

        ``fact_type`` is optional so subclasses may set it after construction.
        """
        # Explicit copy: later column assignments on self.df are then never
        # treated as writes to a slice of the caller's frame.
        self.df = df[fields].copy()
        self.fact_type = fact_type

    def rename(self, mapping: dict[str, str]) -> None:
        """Rename columns of the working dataframe according to ``mapping``."""
        self.df = self.df.rename(columns=mapping)

    def dropna(self, columns: list[str]) -> None:
        """Drop rows that have a missing value in any of ``columns``."""
        self.df = self.df.dropna(subset=columns)

    def load_foreign(self, mapping: dict[str, list]) -> None:
        """Replace key columns with ids from lookup tables.

        Args:
            mapping: Table name -> either one ``[fields, id_name]`` pair or a
                list of such pairs. For every pair the distinct values are
                inserted into the table (``insert_unique``) and the columns
                are replaced by the id column ``id_name`` (``join``).
        """
        for t_name, params in mapping.items():
            # A bare [fields, id_name] pair is wrapped into a list; a list of
            # pairs is used as is. The length check also handles a list that
            # holds exactly one pair, which used to raise IndexError.
            if len(params) == 2 and not isinstance(params[1], list):
                params = [params]

            for fields, id_name in params:
                insert_unique(t_name, self.df, fields)
                self.df = join(t_name, self.df, fields, id_name)

    def __repr__(self) -> str:
        return repr(self.df)

    def load(self) -> None:
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def load_fact(self) -> None:
        """Wrap the working dataframe in a ``Fact`` and insert it into ``facts``."""
        fact = Fact(self.df, self.fact_type)
        insert('facts', fact.df)

In [24]:
class Fact(Table):
    """Dataframe wrapper for rows destined for the ``facts`` table.

    Every column of the given dataframe is kept, a ``type_name`` column holding
    ``f_type`` is added, and that column is immediately resolved against the
    ``fact_types`` table into ``type_id``.
    """

    def __init__(self, df: pd.DataFrame, f_type: str) -> None:
        all_columns = list(df.columns)
        super().__init__(df, all_columns, None)
        self.df = self.df.assign(type_name=f_type)
        fact_type_lookup = {'fact_types': [['type_name'], 'type_id']}
        self.load_foreign(fact_type_lookup)

In [25]:
class Movie(Table):
    """Prepares the IMDB movies dataframe for loading (fact type ``'movie'``)."""

    def __init__(self, df: pd.DataFrame) -> None:
        """Select, rename and clean the movie columns.

        Drops the listed identifier/duplicate columns, renames the rest,
        splits ``date_published`` into day/month/year, parses the money
        columns to floats, removes rows missing any required attribute and
        tags every row with ``type_name = 'imdb'``.
        """
        excluded = ('title', 'imdb_title_id', 'metascore', 'year', 'actors')
        fields = [el for el in df.columns if el not in excluded]
        super().__init__(df, fields, 'movie')
        self.rename({
            'original_title': 'name',
            'reviews_from_critics': 'critics_votes',
            'reviews_from_users': 'users_votes',
            'usa_gross_income': 'usa_income',
            'worlwide_gross_income': 'worldwide_income'
        })
        self._parse_dates()
        self._parse_currencies()
        self.dropna([
            'director', 'language', 'production_company',
            'writer', 'genre', 'year', 'country', 'name'
        ])
        # assign() builds a new frame, so the write never targets a slice.
        self.df = self.df.assign(type_name='imdb')

    def _parse_currencies(self) -> None:
        """Reduce each money column to its first run of digits, as a float."""
        for column in ['usa_income', 'worldwide_income', 'budget']:
            self.df[column] = self.df[column]\
                .apply(extract_number)\
                .astype(float)

    def _parse_dates(self) -> None:
        """Replace ``date_published`` with ``day``, ``month`` and ``year`` columns."""
        # Reset the index FIRST so the extracted parts share the frame's index;
        # extracting before the reset misaligned the concat for any input
        # whose index was not already 0..n-1.
        self.df = self.df.reset_index(drop=True)
        date_parts = extract_date(self.df['date_published'])
        self.df = pd.concat(
            [self.df.drop(['date_published'], axis=1), date_parts], axis=1
        )

    def load(self) -> None:
        """Resolve foreign keys using the mapping stored in ``configs/movies.json``."""
        with open('configs/movies.json') as fhand:
            data = json.load(fhand)
        self.load_foreign(data)

In [30]:
class Oscar(Table):
    """Prepares the Oscar ceremonies dataframe for loading (fact type ``'oscar'``)."""

    def __init__(self, df: pd.DataFrame) -> None:
        """Keep every column, rename two of them and drop incomplete rows."""
        super().__init__(df, list(df.columns), 'oscar')
        column_renames = {
            'ceremony': 'ceremony_number',
            'winner': 'is_winner'
        }
        self.rename(column_renames)
        required_columns = ['film', 'category', 'name']
        self.dropna(required_columns)

    def load(self) -> None:
        """Resolve foreign keys from ``configs/oscars.json``, then rename ``date_film_id`` to ``date_id``."""
        with open('configs/oscars.json') as fhand:
            foreign_keys = json.load(fhand)
        self.load_foreign(foreign_keys)
        self.rename({'date_film_id': 'date_id'})

In [27]:
class Ranking(Table):
    """Prepares the Fandango ranking dataframe for loading (fact type ``'ranking'``)."""

    def __init__(self, df: pd.DataFrame) -> None:
        """Keep the score and vote-count columns and rename them to snake_case."""
        fields = ['FILM', 'RottenTomatoes', 'RottenTomatoes_User',
         'Metacritic', 'Metacritic_User', 'IMDB',
         'Metacritic_user_vote_count', 'IMDB_user_vote_count']
        # Pass the fact type to the base constructor: omitting it raised
        # TypeError because Table.__init__ declares it as a required argument.
        super().__init__(df, fields, 'ranking')
        self.rename(get_rename_mapping(self.df.columns))

In [28]:
# Prepare the movies dataframe and resolve its foreign keys against the database.
movie = Movie(movies)
movie.load()

In [31]:
# Prepare the oscars dataframe and resolve its foreign keys against the database.
oscar = Oscar(oscars)
oscar.load()

In [32]:
# Insert the prepared movie rows into the facts table.
movie.load_fact()

In [33]:
# Insert the prepared oscar rows into the facts table.
oscar.load_fact()