In [1]:
import pandas as pd
import os
import json
import re
from tqdm import tqdm # to show progress bar in for loops

# to increase display width of dataframes in jupyter notebook
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)

In [2]:
root = "movie_dataset/"

In [3]:
os.listdir(root)

['credits.csv',
 'keywords.csv',
 'links.csv',
 'links_small.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'ratings_small.csv']

## credits.csv

In [33]:
credits = pd.read_csv(root+'credits.csv')

In [5]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
credits.shape
credits.columns

Index(['cast', 'crew', 'id'], dtype='object')

In [7]:
def str_to_json_v2(df, col, row):
    regex = r"\"[a-zA-Z_\"0-9 ]*( )*:( )*(\"[a-zA-Z_\"0-9 ]*\"|[0-9]+)"
    regex2 = r":( )*(\"[a-zA-Z_\"0-9 ]*\"|[0-9]+)"

    crew00= df[col][row].replace("\\","").replace("\'","\"").replace("None","\"\"")
    crew00 = split_with_delimiter(crew00, '}, ', '}')
    crew00[0] = crew00[0][1:]
    crew00[-1] = crew00[-1][:-2]

    all_files = []
    for i in range(len(crew00)):
        try:
            all_files.append(json.loads(crew00[i]))
        except ValueError:
            new_string = "{"
            matches = re.finditer(regex, crew00[i])
            for match in matches:
                key_val = match.group()
                matches2 = re.finditer(regex2, key_val)
                for match2 in matches2:
                    endposn = match2.start()
                    val = match2.group()
                    val = val.replace(":", "").strip()
                    if val[0]=="\"" and val[-1]=="\"":
                        val = val[1:-1].replace("\"", "\'")
                        val = " \"" + val + "\""
                    else:
                        val = " " + val

                    new_string+= key_val[:endposn+1]+val
                new_string+=' ,'

            new_string = new_string[:-2]
            new_string+='}'
            all_files.append(json.loads(new_string))
    
    return all_files

def split_with_delimiter(text, delimiter, repl):

    text = text.split(delimiter)
    text =  [e+repl for e in text if e]
    return text

def run_csv(df, no_rows=2):
    credits_json = []
    for x in tqdm(range(2)):
        tmp_dict = {}
        for y in df.columns:
            #print(y)
            if y  == 'id':
                tmp_dict['id'] = df[y][x]
            else:
                tmp_dict[y] = str_to_json_v2(df,y,x)

        credits_json.append(tmp_dict)
    return credits_json

def json_to_file(obj, fp='test.json', indent=3):
    with open('test.json', 'w') as f:
        json.dump(obj, f, default=str, indent=indent)
    print('file saved',fp)

In [8]:
lol = run_csv(credits)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 501.68it/s]


In [9]:
json_to_file(lol)

file saved test.json


## movies_metadata.csv

In [245]:
movies_meta = pd.read_csv(root+'movies_metadata.csv')

#### from the above inspection, we can see that we don't have any use for the following columns: video

In [246]:
def clean_genres(x):
    '''
    To show only the clear genre names as a list
    '''
    if x == "":
        return x
    else:
        x = x.replace('\'', '\"').replace('None','\"\"')
        x = json.loads(x)
        if type(x) == list:
            return [gen["name"] for gen in x]
        else:
            return x["name"]  

In [247]:
#movies_meta['genres'] = movies_meta['genres'].apply(clean_genres)

In [248]:
def str_to_json_v3(x):
    regex = r"\"[a-zA-Z_\"\/.&\-\\(\)0-9 ]*( )*:( )*(\"[a-zA-Z_\"\/.&\-\(\)\,0-9 ]*(\"( )*,|\"})|[0-9]+)"
    regex2 = r":( )*(\"[a-zA-Z_\"\/.&\-\(\)\,0-9 ]*(\"( )*,|\"})|[0-9]+)"
    try:
        return clean_genres(x)
    except:
        crew00= x.replace("\\","").replace("\'","\"").replace("None","\"\"")

        
        new_string = "{"
        matches = re.finditer(regex, crew00)
        for match in matches:
            key_val = match.group()
            matches2 = re.finditer(regex2, key_val)
            for match2 in matches2:
                endposn = match2.start()
                val = match2.group()
                val = val.replace(":", "").strip()
                if val[0]=="\"" and val[-1]=="\"":
                    val = val[1:-1].replace("\"", "\'")
                    val = " \"" + val + "\""
                else:
                    val = " " + val

                new_string+= key_val[:endposn+1]+val
            new_string+=' ,'

        new_string = new_string[:-2]
        new_string = new_string.replace(", ,",",")
        print(new_string)
        

In [249]:
movies_meta['belongs_to_collection'].fillna("", inplace = True)

In [250]:
str_to_json_v3(movies_meta['belongs_to_collection'][0])

'Toy Story Collection'

In [251]:
movies_meta['belongs_to_collection'] = movies_meta['belongs_to_collection'].apply(str_to_json_v3)

{"id": 118221 ,"name": "Weekend at Bernie"s Collection","poster_path": "/gJVBXVetIkgVVMbzODQ3dJwvQkV.jpg","backdrop_path": "/khVvskKc4VzAHbHtQMhKGeKWDxC.jpg"}
{"id": 393564 ,"name": "McHale"s Navy Collection","poster_path": "","backdrop_path": ""}
{"id": 108693 ,"name": "National Lampoon"s Vacation Collection","poster_path": "/zvEUkLpDO7xlgabBEA9CCc0cvAt.jpg","backdrop_path": "/pBmhj6KkEjlzc3e7jAPkIHgGIrE.jpg"}
{"id": 10455 ,"name": "Child"s Play Collection","poster_path": "/50aqbDvbOtdlZrje6Qk4ZvKM7dM.jpg","backdrop_path": "/AAhYXBVIEl6WgQnzfBsauTIC25.jpg"}
{"id": 10455 ,"name": "Child"s Play Collection","poster_path": "/50aqbDvbOtdlZrje6Qk4ZvKM7dM.jpg","backdrop_path": "/AAhYXBVIEl6WgQnzfBsauTIC25.jpg"}
{"id": 10455 ,"name": "Child"s Play Collection","poster_path": "/50aqbDvbOtdlZrje6Qk4ZvKM7dM.jpg","backdrop_path": "/AAhYXBVIEl6WgQnzfBsauTIC25.jpg"}
{"id": 263193 ,"name": "Charlotte"s Web Collection","poster_path": "/7sg3elvEwetbPtTbMwr9IuegAtD.jpg","backdrop_path": "/1FanC71rFOOJuk

In [76]:
movies_meta['belongs_to_collection'][0] 

In [77]:
temp = (movies_meta['belongs_to_collection'][0])
type(temp)

NoneType

In [46]:
temp = dict(temp)

ValueError: dictionary update sequence element #0 has length 1; 2 is required

### links.csv

In [17]:
links_df = pd.read_csv(os.path.join(root, 'links.csv'))

In [18]:
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
45838,176269,6209470,439050.0
45839,176271,2028550,111109.0
45840,176273,303758,67758.0
45841,176275,8536,227506.0
