In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from secret import username, password

In [2]:
# Study data files
imdb_movies_filepath = "data/IMDb movies.csv"
imdb_names_filepath = "data/IMDb names.csv"
imdb_ratings_filepath = "data/IMDb ratings.csv"

In [3]:
# Read the data into Pandas DataFrame
imdb_movies = pd.read_csv(imdb_movies_filepath)
imdb_names = pd.read_csv(imdb_names_filepath)
imdb_ratings = pd.read_csv(imdb_ratings_filepath)

# PERSON FILE

In [4]:
imdb_names.head()

Unnamed: 0,imdb_name_id,name,birth_name,height,bio,birth_details,birth_year,date_of_birth,place_of_birth,death_details,death_year,date_of_death,place_of_death,reason_of_death,spouses,divorces,spouses_with_children,children,primary_profession,known_for_titles
0,nm0000001,Fred Astaire,Frederic Austerlitz Jr.,177.0,"Fred Astaire was born in Omaha, Nebraska, to J...","May 10, 1899 in Omaha, Nebraska, USA",1899.0,1899-05-10,"Omaha, Nebraska, USA","June 22, 1987 in Los Angeles, California, USA ...",1987.0,1987-06-22,"Los Angeles, California, USA",pneumonia,2,0,1,2,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0072308,tt0043044"
1,nm0000002,Lauren Bacall,Betty Joan Perske,174.0,Lauren Bacall was born Betty Joan Perske on Se...,"September 16, 1924 in The Bronx, New York City...",1924.0,1924-09-16,"The Bronx, New York City, New York, USA","August 12, 2014 in New York City, New York, US...",2014.0,2014-08-12,"New York City, New York, USA",stroke,2,1,2,3,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,Brigitte Bardot,166.0,"Brigitte Bardot was born on September 28, 1934...","September 28, 1934 in Paris, France",1934.0,1934-09-28,"Paris, France",,,,,,4,3,1,1,"actress,soundtrack,producer","tt0054452,tt0059956,tt0057345,tt0049189"
3,nm0000004,John Belushi,John Adam Belushi,173.0,"John Belushi was born in Chicago, Illinois, US...","January 24, 1949 in Chicago, Illinois, USA",1949.0,1949-01-24,"Chicago, Illinois, USA","March 5, 1982 in Hollywood, Los Angeles, Calif...",1982.0,1982-03-05,"Hollywood, Los Angeles, California, USA",acute cocaine and heroin intoxication,1,0,0,0,"actor,writer,soundtrack","tt0078723,tt0072562,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,Ernst Ingmar Bergman,179.0,"Ernst Ingmar Bergman was born July 14, 1918, t...","July 14, 1918 in Uppsala, Uppsala län, Sweden",1918.0,1918-07-14,"Uppsala, Uppsala län, Sweden","July 30, 2007 in Fårö, Gotlands län, Sweden",2007.0,2007-07-30,"Fårö, Gotlands län, Sweden",,5,4,5,8,"writer,director,actor","tt0050976,tt0083922,tt0069467,tt0050986"


## person table

In [5]:
name_columns = ['imdb_name_id', 'name', 'birth_name', 'birth_year', 'death_year', 'height']
new_name_df = imdb_names[name_columns].copy()
new_name_df.head()

Unnamed: 0,imdb_name_id,name,birth_name,birth_year,death_year,height
0,nm0000001,Fred Astaire,Frederic Austerlitz Jr.,1899.0,1987.0,177.0
1,nm0000002,Lauren Bacall,Betty Joan Perske,1924.0,2014.0,174.0
2,nm0000003,Brigitte Bardot,Brigitte Bardot,1934.0,,166.0
3,nm0000004,John Belushi,John Adam Belushi,1949.0,1982.0,173.0
4,nm0000005,Ingmar Bergman,Ernst Ingmar Bergman,1918.0,2007.0,179.0


In [6]:
# Rename the column headers
transform_name_df = new_name_df.rename(columns={'imdb_name_id': 'person_id',
                                                 'name': 'person_name'})

# Clean the data by dropping duplicates and setting the index
transform_name_df['unique_id'] = transform_name_df.birth_name.astype(str) + '_' + transform_name_df.birth_year.astype(str)

transform_name_df.drop_duplicates('unique_id', inplace=True)

transform_name_df

Unnamed: 0,person_id,person_name,birth_name,birth_year,death_year,height,unique_id
0,nm0000001,Fred Astaire,Frederic Austerlitz Jr.,1899.0,1987.0,177.0,Frederic Austerlitz Jr._1899.0
1,nm0000002,Lauren Bacall,Betty Joan Perske,1924.0,2014.0,174.0,Betty Joan Perske_1924.0
2,nm0000003,Brigitte Bardot,Brigitte Bardot,1934.0,,166.0,Brigitte Bardot_1934.0
3,nm0000004,John Belushi,John Adam Belushi,1949.0,1982.0,173.0,John Adam Belushi_1949.0
4,nm0000005,Ingmar Bergman,Ernst Ingmar Bergman,1918.0,2007.0,179.0,Ernst Ingmar Bergman_1918.0
...,...,...,...,...,...,...,...
175710,nm9991131,Charoen Kaithitisuwan,Charoen Kaithitisuwan,,,,Charoen Kaithitisuwan_nan
175711,nm9991653,Wojciech Rzehak,Wojciech Rzehak,,,,Wojciech Rzehak_nan
175712,nm9992720,McMagic Cardenas,McMagic Cardenas,,,,McMagic Cardenas_nan
175713,nm9992840,Rashaduzzman Shohag,Rashaduzzman Shohag,,,,Rashaduzzman Shohag_nan


In [7]:
transform_name_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 174580 entries, 0 to 175714
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   person_id    174580 non-null  object 
 1   person_name  174580 non-null  object 
 2   birth_name   174580 non-null  object 
 3   birth_year   75097 non-null   float64
 4   death_year   26832 non-null   float64
 5   height       30020 non-null   float64
 6   unique_id    174580 non-null  object 
dtypes: float64(3), object(4)
memory usage: 10.7+ MB


## job_title table

In [8]:
#Create lists for job title id and job title
job_title_id = ['jt001', 'jt002', 'jt003']
job_title = ['actor', 'director', 'writer']

#Convert list to DataFrame
job_title_df = pd.DataFrame(list(zip(job_title_id, job_title)), 
               columns =['job_title_id', 'job_title'])
job_title_df.head()

Unnamed: 0,job_title_id,job_title
0,jt001,actor
1,jt002,director
2,jt003,writer


# MOVIES FILE

In [9]:
imdb_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [10]:
director_df = imdb_movies[['imdb_title_id', 'director']].copy()
director_df.head()

Unnamed: 0,imdb_title_id,director
0,tt0000574,Charles Tait
1,tt0001892,Urban Gad
2,tt0002101,Charles L. Gaskill
3,tt0002130,"Francesco Bertolini, Adolfo Padovan"
4,tt0002199,Sidney Olcott


In [11]:
# Rename the column headers
transform_director_df = director_df.rename(columns={'imdb_title_id': 'movie_id',
                                                     'director': 'person_name'})

transform_director_df.head()

Unnamed: 0,movie_id,person_name
0,tt0000574,Charles Tait
1,tt0001892,Urban Gad
2,tt0002101,Charles L. Gaskill
3,tt0002130,"Francesco Bertolini, Adolfo Padovan"
4,tt0002199,Sidney Olcott


In [12]:
newdf = transform_director_df.assign(person_name=transform_director_df.person_name.str.split(', ')).explode('person_name')
newdf['job_title_id'] ='jt002'
newdf

Unnamed: 0,movie_id,person_name,job_title_id
0,tt0000574,Charles Tait,jt002
1,tt0001892,Urban Gad,jt002
2,tt0002101,Charles L. Gaskill,jt002
3,tt0002130,Francesco Bertolini,jt002
3,tt0002130,Adolfo Padovan,jt002
...,...,...,...
81268,tt9903716,Aswani Kumar V.,jt002
81269,tt9905412,Zam,jt002
81270,tt9905462,T.V. Chandran,jt002
81271,tt9911774,Vineesh Aaradya,jt002


In [13]:
newdf['person_id'] = newdf['person_name'].map(transform_name_df.set_index('person_name')[unique_id'])
newdf.head()

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
newdf['no_id'] = newdf['person_id'].isnull()
newdf.head()

In [None]:
person_id = newdf[newdf['no_id'] == False]
no_person_id = newdf[newdf['no_id'] == True]

In [None]:
person_id.info()

In [None]:
no_person_id.info()

In [None]:
transform_name_df.tail()

### Create database connection

In [None]:
# connection_string = f'{username}:{password}@localhost:5432/customer_db'
# engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables
# engine.table_names()

### Load DataFrames into database

In [None]:
# transform_movie_df.to_sql(name='movie', con=engine, if_exists='append', index=True)