In [1]:
import pandas as pd
import os
from glob import glob

In [2]:
# glob finds all the files in a specified directory that match the condition
# we do this so we can open multiple files at once
csv_files = glob("./zippedData/*.csv.gz")
csv_files

['./zippedData/imdb.title.crew.csv.gz',
 './zippedData/tmdb.movies.csv.gz',
 './zippedData/imdb.title.akas.csv.gz',
 './zippedData/imdb.title.ratings.csv.gz',
 './zippedData/imdb.name.basics.csv.gz',
 './zippedData/imdb.title.basics.csv.gz',
 './zippedData/tn.movie_budgets.csv.gz',
 './zippedData/bom.movie_gross.csv.gz',
 './zippedData/imdb.title.principals.csv.gz']

In [3]:
# tsv files, because later tsv files need to be opened with sep='\t'
tsv_files = glob("./Unused Data/*.tsv.gz")
tsv_files

['./Unused Data/rt.reviews.tsv.gz', './Unused Data/rt.movie_info.tsv.gz']

In [4]:
data_dict = {}

for file in csv_files:
    data_dict[file] = pd.read_csv(file)
    
for file in tsv_files:
    data_dict[file] = pd.read_csv(file, sep='\t', encoding= 'unicode_escape') 

In [5]:
df2 = data_dict['./zippedData/bom.movie_gross.csv.gz']
df2.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [6]:
# To join these two dataframes, I'm thinking that we can do it by joining the domestic/foreign gross.
# The only thing is, in bon.movie_gross has the numbers rounded to the nearest 100k's whereas the 
# tn.movie_budgets is specific to the last dollar. I would have to convert the tn.movie_budgets columns
# to rounded numbers to then join the two dataframes.

In [7]:
df = data_dict['./zippedData/tmdb.movies.csv.gz']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


In [8]:
df.keys()

Index(['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title',
       'popularity', 'release_date', 'title', 'vote_average', 'vote_count'],
      dtype='object')

In [9]:
type('genre_ids')

str

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [11]:
di = {"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}


In [12]:
print(di)

{'genres': [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}, {'id': 99, 'name': 'Documentary'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 14, 'name': 'Fantasy'}, {'id': 36, 'name': 'History'}, {'id': 27, 'name': 'Horror'}, {'id': 10402, 'name': 'Music'}, {'id': 9648, 'name': 'Mystery'}, {'id': 10749, 'name': 'Romance'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 10770, 'name': 'TV Movie'}, {'id': 53, 'name': 'Thriller'}, {'id': 10752, 'name': 'War'}, {'id': 37, 'name': 'Western'}]}


In [13]:
df['genre_ids']

0            [12, 14, 10751]
1        [14, 12, 16, 10751]
2              [12, 28, 878]
3            [16, 35, 10751]
4              [28, 878, 12]
                ...         
26512               [27, 18]
26513               [18, 53]
26514           [14, 28, 12]
26515        [10751, 12, 28]
26516               [53, 27]
Name: genre_ids, Length: 26517, dtype: object

In [14]:
new_dict = {}
for i in range(0, len(di['genres'])):
    new_id = str(di['genres'][i]['id'])
    new_dict[new_id] = di['genres'][i]['name']
new_dict


{'28': 'Action',
 '12': 'Adventure',
 '16': 'Animation',
 '35': 'Comedy',
 '80': 'Crime',
 '99': 'Documentary',
 '18': 'Drama',
 '10751': 'Family',
 '14': 'Fantasy',
 '36': 'History',
 '27': 'Horror',
 '10402': 'Music',
 '9648': 'Mystery',
 '10749': 'Romance',
 '878': 'Science Fiction',
 '10770': 'TV Movie',
 '53': 'Thriller',
 '10752': 'War',
 '37': 'Western'}

In [15]:
y = df[['Unnamed: 0', 'genre_ids']]
y['genre_ids'] = y['genre_ids'].str.strip('[]')
df2 = pd.DataFrame(y['genre_ids'].str.split(', ').values.tolist())
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,0,1,2,3,4,5,6
0,12,14,10751,,,,
1,14,12,16,10751,,,
2,12,28,878,,,,
3,16,35,10751,,,,
4,28,878,12,,,,
...,...,...,...,...,...,...,...
26512,27,18,,,,,
26513,18,53,,,,,
26514,14,28,12,,,,
26515,10751,12,28,,,,


In [16]:
df2[0]=df2[0].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,14,10751,,,,
1,Fantasy,12,16,10751,,,
2,Adventure,28,878,,,,
3,Animation,35,10751,,,,
4,Action,878,12,,,,
...,...,...,...,...,...,...,...
26512,Horror,18,,,,,
26513,Drama,53,,,,,
26514,Fantasy,28,12,,,,
26515,Family,12,28,,,,


In [17]:
df2[1]=df2[1].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,10751,,,,
1,Fantasy,Adventure,16,10751,,,
2,Adventure,Action,878,,,,
3,Animation,Comedy,10751,,,,
4,Action,Science Fiction,12,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,12,,,,
26515,Family,Adventure,28,,,,


In [18]:
df2[2]=df2[2].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,10751,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [19]:
df2[3]=df2[3].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,Family,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [20]:
df2[4]=df2[4].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,Family,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [21]:
df2[5]=df2[5].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,Family,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [22]:
df2[6]=df2[6].map(new_dict)
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,Family,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [23]:
df2

Unnamed: 0,0,1,2,3,4,5,6
0,Adventure,Fantasy,Family,,,,
1,Fantasy,Adventure,Animation,Family,,,
2,Adventure,Action,Science Fiction,,,,
3,Animation,Comedy,Family,,,,
4,Action,Science Fiction,Adventure,,,,
...,...,...,...,...,...,...,...
26512,Horror,Drama,,,,,
26513,Drama,Thriller,,,,,
26514,Fantasy,Action,Adventure,,,,
26515,Family,Adventure,Action,,,,


In [24]:
domesticdf = pd.read_csv('domesticdf.csv')

In [25]:
domesticdf

Unnamed: 0,movie,production_budget,domestic_gross,production_profit_domestic
0,Star Wars Ep. VII: The Force Awakens,306000000,936662225,630662225
1,Black Panther,200000000,700059566,500059566
2,Titanic,200000000,659363944,459363944
3,Star Wars Ep. IV: A New Hope,11000000,460998007,449998007
4,Jurassic World,215000000,652270625,437270625
...,...,...,...,...
5777,Battleship,220000000,65233400,-154766600
5778,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,-169536125
5779,The Lone Ranger,275000000,89302115,-185697885
5780,John Carter,275000000,73058679,-201941321


In [43]:
# df.merge(top_int_100, left_on='movie', right_on='original_title')

In [40]:
df.merge(domesticdf, left_on='original_title', right_on='movie')

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,movie,production_budget,domestic_gross,production_profit_domestic
0,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,How to Train Your Dragon,165000000,217581232,52581232
1,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,Iron Man 2,170000000,312433331,142433331
2,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,Toy Story,30000000,191796233,161796233
3,2473,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,Toy Story,30000000,191796233,161796233
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186,Inception,160000000,292576195,132576195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,26323,[],509316,en,The Box,0.600,2018-03-04,The Box,8.0,1,The Box,25000000,15051977,-9948023
2312,26425,[10402],509306,en,The Box,0.600,2018-03-04,The Box,6.0,1,The Box,25000000,15051977,-9948023
2313,26092,"[35, 16]",546674,en,Enough,0.719,2018-03-22,Enough,8.7,3,Enough,38000000,39177215,1177215
2314,26322,[],513161,en,Undiscovered,0.600,2018-04-07,Undiscovered,8.0,1,Undiscovered,9000000,1069318,-7930682


In [41]:
df

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [36]:
# Genre Code Breaker-
# '{"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}'

In [28]:
df2.to_csv('genres.csv', index=False)