# 20-sentence-embedding for review, description and overview

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
import umap

## Read in main.csv

In [2]:
main = pd.read_csv('Data/main_211112.csv', dtype=str)

In [3]:
main

Unnamed: 0,id,review,review_1st,review_2nd,original_title,year,date_published,duration,country,language,...,recommendation3,recommendation4,recommendation5,recommendation6,recommendation7,recommendation8,recommendation9,recommendation10,recommendation11,recommendation12
0,tt0018515,This is a very strange film that was long thou...,2.6453724,5.547905,Two Arabian Knights,1927,1927-09-23,92,USA,English,...,The Last Command,Street Angel,Sadie Thompson,Underworld,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade
1,tt0018515,"William Boyd and Louis Wolheim are the ""Two Ar...",-0.4441953,2.2827504,Two Arabian Knights,1927,1927-09-23,92,USA,English,...,The Last Command,Street Angel,Sadie Thompson,Underworld,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade
2,tt0018515,Not very many movies come to my mind that cove...,0.8150438,3.9238899,Two Arabian Knights,1927,1927-09-23,92,USA,English,...,The Last Command,Street Angel,Sadie Thompson,Underworld,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade
3,tt0018515,"The third movie produced by Howard Hughes, thi...",2.477075,5.466526,Two Arabian Knights,1927,1927-09-23,92,USA,English,...,The Last Command,Street Angel,Sadie Thompson,Underworld,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade
4,tt0018515,"Turner Classic Movies showed this silent, B&W ...",1.1922804,2.5448508,Two Arabian Knights,1927,1927-09-23,92,USA,English,...,The Last Command,Street Angel,Sadie Thompson,Underworld,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28623,tt0492486,This was the surprise film at the Dublin Horro...,1.0869972,6.843044,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,...,YellowBrickRoad,Shrooms 3D,Creep,Wicked Little Things,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen
28624,tt0492486,I've seen hundreds of horror movies in my life...,0.17316706,6.62107,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,...,YellowBrickRoad,Shrooms 3D,Creep,Wicked Little Things,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen
28625,tt0492486,I cannot remember the last time a horror movie...,1.0682999,6.835441,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,...,YellowBrickRoad,Shrooms 3D,Creep,Wicked Little Things,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen
28626,tt0492486,I saw this movie tonight. Never have left comm...,1.0295599,3.5069268,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,...,YellowBrickRoad,Shrooms 3D,Creep,Wicked Little Things,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen


## How many unique movies do we have in main dataset

In [4]:
len(set(main['id'].tolist()))

3037

## Convert review and description into list for models

In [5]:
reviews = main["review"].tolist()
reviews[:2]

descriptions = main["description"].tolist()
descriptions[:2]

overviews = main["overview"].tolist()
overviews[:2]

["Director Lewis Milestone's 1927 comedy follows the exploits of two American soldiers during WWI. Starring William Boyd, Mary Astor and Louis Wolheim.",
 "Director Lewis Milestone's 1927 comedy follows the exploits of two American soldiers during WWI. Starring William Boyd, Mary Astor and Louis Wolheim."]

## Download and import model we need

In [6]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

## Create review embedding by above model

### Combine reviews from the same movie into one

In [7]:
df = main.groupby(by='id').sum()[['review']]
df['id'] = df.index
reviews = df['review'].tolist()
df

Unnamed: 0_level_0,review,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000574,The Story of the Kelly Gang (1906) symbolizes ...,tt0000574
tt0002461,Credited as the earliest complete feature-leng...,tt0002461
tt0005078,"In this short, under an hour feature, all we h...",tt0005078
tt0005339,Watching this today is akin to taking a trip i...,tt0005339
tt0006206,Serials are a low point in film history and th...,tt0006206
...,...,...
tt0988047,It sure took a long time for this film to debu...,tt0988047
tt0988849,Definition of a donkey punch: a blow administe...,tt0988849
tt0990361,I've had the chance to see this movie yesterda...,tt0990361
tt0995740,"Do not, I repeat DO NOT watch this movie if yo...",tt0995740


In [8]:
review_embeddings = model.encode(reviews)
print("Shape:", review_embeddings.shape)
print(review_embeddings)

Shape: (3037, 512)
[[-0.05172382 -0.01318167 -0.02941219 ... -0.08277107 -0.04488661
  -0.0467704 ]
 [ 0.02845402 -0.00476698 -0.0085305  ... -0.07068139  0.00186067
   0.03523669]
 [-0.02477219 -0.04172837 -0.04370242 ... -0.0950805   0.06399228
   0.01111828]
 ...
 [-0.04749458 -0.02303065 -0.02309803 ... -0.04840309 -0.01880701
  -0.00211786]
 [-0.01954935  0.03467304 -0.0288606  ... -0.03560465  0.00657937
   0.05115532]
 [-0.03478554 -0.03051612 -0.01558649 ... -0.07321873 -0.01056405
  -0.00945658]]


## Create description embedding

In [9]:
df_des = main[['id', 'description']].drop_duplicates()
descriptions = df_des['description'].tolist()
df_des

Unnamed: 0,id,description
0,tt0018515,Two American soldiers are captured by the Germ...
10,tt0118866,The relationship between four female temps all...
40,tt0050652,Aging gunslinger Jacob Wade hopes to settle do...
46,tt0051885,A doctor's daughter is kidnapped and buried al...
49,tt0248123,Julietta returns to Berlin with a dilemma on h...
...,...,...
28580,tt0238948,"A man on the run takes another man's passport,..."
28593,tt0406375,Two young brothers are drawn into an intergala...
28597,tt0063591,"A pardoned stagecoach robber, becomes governme..."
28598,tt0119791,"A law student, who takes a job as a night watc..."


In [10]:
description_embeddings = model.encode(descriptions)
print("Shape:", description_embeddings.shape)
print(description_embeddings)

Shape: (3037, 512)
[[ 0.00010171  0.00142653  0.0030459  ...  0.04980651  0.02470978
  -0.01856348]
 [ 0.08609562 -0.02957341  0.08522253 ... -0.04514211 -0.07731085
  -0.01606046]
 [-0.00051633 -0.04036959 -0.01269487 ...  0.04954691 -0.03707771
  -0.00208472]
 ...
 [ 0.0303678   0.02007469 -0.01344747 ... -0.03773912  0.0312012
  -0.0448827 ]
 [ 0.04699449  0.01423166  0.00188398 ...  0.02691923  0.02676632
  -0.03699284]
 [-0.03081206  0.00739098  0.06345712 ...  0.01487389  0.00680741
   0.06922294]]


## Create overview embeddings

In [11]:
df_over = main[['id', 'overview']].drop_duplicates()
overviews = df_over['overview'].tolist()
df_over

Unnamed: 0,id,overview
0,tt0018515,Director Lewis Milestone's 1927 comedy follows...
10,tt0118866,Iris can best be described as a wallflower. Sh...
40,tt0050652,Aging gunslinger Jacob Wade hopes to settle do...
46,tt0051885,A doctor's daughter is kidnapped and buried al...
49,tt0248123,A dramatic teenage love story set against the ...
...,...,...
28580,tt0238948,The governor of a Mexican state is assassinate...
28593,tt0406375,"After their father is called into work, two yo..."
28597,tt0063591,Jesse W. Haywood (Don Knotts) graduates from d...
28598,tt0119791,"A law student, who takes a job as a night watc..."


In [12]:
overview_embeddings = model.encode(overviews)
print("Shape:", overview_embeddings.shape)
print(overview_embeddings)

Shape: (3037, 512)
[[-0.00078443 -0.06351549 -0.13784662 ... -0.04011483 -0.0479288
  -0.05433805]
 [ 0.06199616 -0.00951628 -0.05670635 ...  0.0027988  -0.00995845
   0.02490294]
 [ 0.08615995 -0.03223411 -0.03190124 ... -0.03967644  0.02711902
  -0.02216075]
 ...
 [ 0.00195637  0.06153557  0.01064557 ... -0.00223976  0.05434822
   0.00480348]
 [ 0.04699449  0.01423166  0.00188398 ...  0.02691923  0.02676632
  -0.03699284]
 [-0.00325433 -0.00610787  0.00798221 ... -0.0211592   0.04874625
   0.05028084]]


## Reduce 512 dimension to 2 by using umap technique and plot

In [13]:
reducer = umap.UMAP()
review2D = reducer.fit_transform(review_embeddings)
describe2D = reducer.fit_transform(description_embeddings)
overview2D = reducer.fit_transform(overview_embeddings)
print(review2D.shape)
print(describe2D.shape)
print(overview2D.shape)

(3037, 2)
(3037, 2)
(3037, 2)


### Drop all previous embedding columns

In [14]:
main.columns

Index(['id', 'review', 'review_1st', 'review_2nd', 'original_title', 'year',
       'date_published', 'duration', 'country', 'language', 'director',
       'writer', 'production_company', 'actors', 'description',
       'description_1st', 'description_2nd', 'votes', 'reviews_from_users',
       'reviews_from_critics', 'adult', 'genres', 'original_language',
       'overview', 'popularity', 'poster_path', 'revenue', 'runtime', 'status',
       'video', 'vote_count', 'genre_1', 'genre_2', 'genre_3',
       'recommendation1', 'recommendation2', 'recommendation3',
       'recommendation4', 'recommendation5', 'recommendation6',
       'recommendation7', 'recommendation8', 'recommendation9',
       'recommendation10', 'recommendation11', 'recommendation12'],
      dtype='object')

In [15]:
main_drop = main.drop(columns=['review', 'review_1st', 'review_2nd', 'description', 'description_1st', 'description_2nd'])

## Add review 2d embeddings to main dataset

In [16]:
df['review_1st']=review2D[:,0]
df['review_2nd']=review2D[:,1]
df_re = df.drop(columns='review')
df_re.reset_index(drop=True, inplace=True)
df_re

Unnamed: 0,id,review_1st,review_2nd
0,tt0000574,2.970235,10.711953
1,tt0002461,2.900796,11.341978
2,tt0005078,0.627618,12.805361
3,tt0005339,0.984023,11.585050
4,tt0006206,6.032124,9.722160
...,...,...,...
3032,tt0988047,2.914837,13.828070
3033,tt0988849,5.236000,12.443265
3034,tt0990361,2.541798,12.849465
3035,tt0995740,3.031713,14.914072


In [17]:
main_rev = main_drop.merge(df_re, how='left', on = 'id')

## Add description 2d embeddings to main dataset

In [18]:
df_des['description_1st'] = describe2D[:, 0]
df_des['description_2nd'] = describe2D[:, 1]
df_des = df_des.drop(columns='description')
df_des.reset_index(drop=True, inplace=True)
df_des

Unnamed: 0,id,description_1st,description_2nd
0,tt0018515,17.669596,-0.738235
1,tt0118866,20.341370,-4.583504
2,tt0050652,20.424614,-2.285807
3,tt0051885,21.787266,-3.158614
4,tt0248123,19.101181,-5.684755
...,...,...,...
3032,tt0238948,20.537291,-1.603543
3033,tt0406375,20.091591,-2.168772
3034,tt0063591,21.206562,-0.786693
3035,tt0119791,22.496815,-2.224130


In [19]:
main_rev_des = main_rev.merge(df_des, how='left', on = 'id')
main_rev_des

Unnamed: 0,id,original_title,year,date_published,duration,country,language,director,writer,production_company,...,recommendation7,recommendation8,recommendation9,recommendation10,recommendation11,recommendation12,review_1st,review_2nd,description_1st,description_2nd
0,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235
1,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235
2,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235
3,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235
4,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,Tempest,The Way of All Flesh,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28623,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298
28624,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298
28625,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298
28626,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,Catacombs,Dark Ride,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298


## Add overview 2D embeddings to main dataset

In [20]:
df_over['overview_1st'] = overview2D[:, 0]
df_over['overview_2nd'] = overview2D[:, 1]
df_over = df_over.drop(columns='overview')
df_over.reset_index(drop=True, inplace=True)
df_over

Unnamed: 0,id,overview_1st,overview_2nd
0,tt0018515,8.649476,3.307126
1,tt0118866,12.137049,4.244494
2,tt0050652,9.784697,2.606914
3,tt0051885,11.626658,0.298345
4,tt0248123,10.454543,5.274888
...,...,...,...
3032,tt0238948,10.153371,1.411950
3033,tt0406375,8.163801,0.694519
3034,tt0063591,11.498743,3.363511
3035,tt0119791,11.200775,-0.287683


In [21]:
main_rev_des_over = main_rev_des.merge(df_over, how='left', on = 'id')
main_rev_des_over

Unnamed: 0,id,original_title,year,date_published,duration,country,language,director,writer,production_company,...,recommendation9,recommendation10,recommendation11,recommendation12,review_1st,review_2nd,description_1st,description_2nd,overview_1st,overview_2nd
0,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126
1,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126
2,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126
3,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126
4,tt0018515,Two Arabian Knights,1927,1927-09-23,92,USA,English,Lewis Milestone,"Wallace Smith, Cyril Gardner",The Caddo Company,...,The Front Page,The Divine Lady,The Patent Leather Kid,The Love Parade,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28623,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298,9.588804,0.534126
28624,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298,9.588804,0.534126
28625,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298,9.588804,0.534126
28626,tt0492486,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",English,Paddy Breathnach,Pearse Elliott,Capitol Films,...,100 Feet,La bête à l'affût,Finders Keepers,Shin Tange Sazen,2.569802,12.533641,20.324760,-0.633298,9.588804,0.534126


## Select needed columns

In [22]:
main_rev_des_over.columns

Index(['id', 'original_title', 'year', 'date_published', 'duration', 'country',
       'language', 'director', 'writer', 'production_company', 'actors',
       'votes', 'reviews_from_users', 'reviews_from_critics', 'adult',
       'genres', 'original_language', 'overview', 'popularity', 'poster_path',
       'revenue', 'runtime', 'status', 'video', 'vote_count', 'genre_1',
       'genre_2', 'genre_3', 'recommendation1', 'recommendation2',
       'recommendation3', 'recommendation4', 'recommendation5',
       'recommendation6', 'recommendation7', 'recommendation8',
       'recommendation9', 'recommendation10', 'recommendation11',
       'recommendation12', 'review_1st', 'review_2nd', 'description_1st',
       'description_2nd', 'overview_1st', 'overview_2nd'],
      dtype='object')

In [23]:
final_main = main_rev_des_over[['id', 'review_1st', 'review_2nd', 
                         'description_1st', 'description_2nd', 
                         'overview_1st', 'overview_2nd', 
                         'country', 'language', 'director', 'votes', 
                         'reviews_from_critics', 'popularity', 
                         'genre_1', 'genre_2', 'genre_3']]

In [24]:
movie_only = final_main.drop_duplicates()
movie_only.to_csv("Data/final_main.csv", index=False)
movie_only

Unnamed: 0,id,review_1st,review_2nd,description_1st,description_2nd,overview_1st,overview_2nd,country,language,director,votes,reviews_from_critics,popularity,genre_1,genre_2,genre_3
0,tt0018515,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126,USA,English,Lewis Milestone,3.35,6.0,0.38981,Adventure,Comedy,Romance
10,tt0118866,1.672779,11.368067,20.341370,-4.583504,12.137049,4.244494,"UK, USA",English,Jill Sprecher,6.05,43.0,1.885671,Comedy,Drama,
40,tt0050652,3.586681,10.927030,20.424614,-2.285807,9.784697,2.606914,USA,English,Henry Levin,6.25,8.0,0.127452,Drama,Western,
46,tt0051885,5.692213,10.103884,21.787266,-3.158614,11.626658,0.298345,USA,English,William Castle,5.949999999999999,22.0,0.837849,Horror,Thriller,
49,tt0248123,5.052363,14.008677,19.101181,-5.684755,10.454543,5.274888,Germany,German,Christoph Stark,6.6,7.0,0.041677,Drama,Romance,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28580,tt0238948,3.672903,13.673236,20.537291,-1.603543,10.153371,1.411950,USA,"English, Spanish",George Gallo,5.5,46.0,3.067916,Action,Comedy,Crime
28593,tt0406375,2.739948,14.026184,20.091591,-2.168772,8.163801,0.694519,USA,English,Jon Favreau,6.15,132.0,10.695639,Action,Adventure,Comedy
28597,tt0063591,3.020100,9.555977,21.206562,-0.786693,11.498743,3.363511,USA,English,Alan Rafkin,6.300000000000001,8.0,1.053244,Action,Adventure,Comedy
28598,tt0119791,4.831090,9.789382,22.496815,-2.224130,11.200775,-0.287683,USA,English,Ole Bornedal,6.2,52.0,6.122067,Drama,Horror,Thriller


## Merge with imdb_recommendation_tid.csv file

In [25]:
imdb = pd.read_csv('Data/imdb_recommendations_tid.csv', dtype=str)
imdb

Unnamed: 0,id,recommendation1,recommendation2,recommendation3,recommendation4,recommendation5,recommendation6,recommendation7,recommendation8,recommendation9,recommendation10,recommendation11,recommendation12
0,tt0018515,tt0019304,tt0019071,tt0018379,tt0019429,tt0018526,tt0019451,tt0019344,tt0019553,tt0021890,tt0019824,tt0018054,tt0018253
1,tt0118866,tt0116041,tt0114095,tt0119324,tt3314958,tt1512240,tt0268690,tt0443698,tt0067350,tt0090037,tt5791216,tt0082220,tt0068718
2,tt0050652,tt0052877,tt0044683,tt0048707,tt0058286,tt0043079,tt0051046,tt0051849,tt0043276,tt0051496,tt0042426,tt0051848,tt0051400
3,tt0051885,tt0047348,tt0054988,tt0060228,tt0059821,tt0053931,tt0055200,tt0052602,tt0053363,tt0048190,tt0062909,tt0023790,tt0053719
4,tt0248123,tt4326444,tt13279528,tt0072852,tt0433770,tt0045942,tt0402022,tt4048272,tt9271164,tt0185125,tt15392100,tt11242218,tt9203694
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3032,tt0238948,tt0166195,tt4191580,tt0199290,tt0119664,tt0101252,tt0241760,tt0297181,tt0472126,tt5304664,tt0410097,tt0421045,tt0119807
3033,tt0406375,tt0398808,tt0416236,tt0113497,tt0405325,tt1155076,tt0373051,tt0963966,tt1078912,tt0477347,tt0338348,tt0494238,tt0396752
3034,tt0063591,tt0067219,tt0064606,tt0081376,tt0077698,tt0052005,tt0079754,tt0045589,tt0091846,tt0047197,tt0067809,tt0043767,tt0065051
3035,tt0119791,tt0110631,tt0119535,tt0120662,tt4287464,tt0111149,tt0108473,tt0403358,tt0145531,tt0119675,tt0070444,tt0115744,tt0104549


In [28]:
model_data = movie_only.merge(imdb, how='left', on = 'id').drop(columns=['recommendation6','recommendation7',
                                                           'recommendation8', 'recommendation9',
                                                           'recommendation10', 'recommendation11',
                                                           'recommendation12'])
model_data.to_csv("Data/model_data.csv", index=False)

In [29]:
pd.read_csv('Data/model_data.csv')

Unnamed: 0,id,review_1st,review_2nd,description_1st,description_2nd,overview_1st,overview_2nd,country,language,director,...,reviews_from_critics,popularity,genre_1,genre_2,genre_3,recommendation1,recommendation2,recommendation3,recommendation4,recommendation5
0,tt0018515,4.942679,13.482060,17.669596,-0.738235,8.649476,3.307126,USA,English,Lewis Milestone,...,6.0,0.389810,Adventure,Comedy,Romance,tt0019304,tt0019071,tt0018379,tt0019429,tt0018526
1,tt0118866,1.672779,11.368067,20.341370,-4.583504,12.137049,4.244494,"UK, USA",English,Jill Sprecher,...,43.0,1.885671,Comedy,Drama,,tt0116041,tt0114095,tt0119324,tt3314958,tt1512240
2,tt0050652,3.586681,10.927030,20.424614,-2.285807,9.784697,2.606914,USA,English,Henry Levin,...,8.0,0.127452,Drama,Western,,tt0052877,tt0044683,tt0048707,tt0058286,tt0043079
3,tt0051885,5.692213,10.103884,21.787266,-3.158614,11.626658,0.298345,USA,English,William Castle,...,22.0,0.837849,Horror,Thriller,,tt0047348,tt0054988,tt0060228,tt0059821,tt0053931
4,tt0248123,5.052363,14.008677,19.101181,-5.684755,10.454543,5.274888,Germany,German,Christoph Stark,...,7.0,0.041677,Drama,Romance,,tt4326444,tt13279528,tt0072852,tt0433770,tt0045942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3032,tt0238948,3.672903,13.673236,20.537290,-1.603543,10.153371,1.411950,USA,"English, Spanish",George Gallo,...,46.0,3.067916,Action,Comedy,Crime,tt0166195,tt4191580,tt0199290,tt0119664,tt0101252
3033,tt0406375,2.739948,14.026184,20.091590,-2.168772,8.163801,0.694519,USA,English,Jon Favreau,...,132.0,10.695639,Action,Adventure,Comedy,tt0398808,tt0416236,tt0113497,tt0405325,tt1155076
3034,tt0063591,3.020100,9.555977,21.206562,-0.786693,11.498743,3.363511,USA,English,Alan Rafkin,...,8.0,1.053244,Action,Adventure,Comedy,tt0067219,tt0064606,tt0081376,tt0077698,tt0052005
3035,tt0119791,4.831091,9.789382,22.496815,-2.224130,11.200775,-0.287683,USA,English,Ole Bornedal,...,52.0,6.122067,Drama,Horror,Thriller,tt0110631,tt0119535,tt0120662,tt4287464,tt0111149
