In [1]:
import pandas as pd

In [2]:
# Load the two IMDb tables from CSV files located in the working directory.
# low_memory=False makes pandas parse the movies file in a single pass, so each
# column's dtype is inferred from the whole column instead of chunk by chunk.
movies_csv, ratings_csv = 'IMDb movies.csv', 'IMDb ratings.csv'
df_movies = pd.read_csv(movies_csv, low_memory=False)
df_ratings = pd.read_csv(ratings_csv)

In [3]:
# narrow each table to a fixed subset of its columns
# (a missing column name raises KeyError rather than being silently ignored)
movie_columns = ['imdb_title_id', 'title', 'year',
                 'genre', 'country', 'director', 'actors']
rating_columns = ['imdb_title_id', 'total_votes', 'mean_vote']

df_movies = df_movies.loc[:, movie_columns]
df_ratings = df_ratings.loc[:, rating_columns]

# concat()

## Concatenate vertically

To concatenate vertically (along the rows), the two DataFrames should have the same columns; any column present in only one of them is filled with `NaN` for the rows of the other.

In [4]:
# Two small frames with identical columns ('id', 'age'), built row by row,
# to demonstrate stacking along the rows.
# NOTE(review): id 'F' appears twice in df2 -- possibly meant to be 'H';
# kept as-is so the displayed output stays valid.
df1 = pd.DataFrame(
    data=[('A', 30), ('B', 23), ('C', 25), ('D', 22)],
    columns=['id', 'age'],
)
df2 = pd.DataFrame(
    data=[('E', 40), ('F', 21), ('G', 19), ('F', 24)],
    columns=['id', 'age'],
)

In [5]:
# default axis=0: df2's rows are appended below df1's, and each frame keeps
# its own index labels (so 0-3 appears twice in the result)
pd.concat([df1, df2])

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
0,E,40
1,F,21
2,G,19
3,F,24


### Exercise

In [6]:
# extract a reproducible 50% sample of the original dataframe
# (a fixed random_state makes every "Restart Kernel -> Run All" draw the same
# rows; without it the sample, and everything derived from it, changes per run)
df_sample = df_movies.sample(frac=0.5, random_state=42)

In [7]:
# (rows, columns) of the two frames about to be stacked, one per line
print(df_movies.shape, df_sample.shape, sep='\n')

(85855, 7)
(42928, 7)


In [8]:
# stack df_sample below df_movies (vertically, along the rows); every row keeps
# its original index label, so a sampled row shares its label with its source row
df_concat_vertically = pd.concat(objs=[df_movies, df_sample], axis='index')
df_concat_vertically.head()

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors
0,tt0000009,Miss Jerry,1894,Romance,USA,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D..."
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be..."
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse..."
3,tt0002101,Cleopatra,1912,"Drama, History",USA,Charles L. Gaskill,"Helen Gardner, Pearl Sindelar, Miss Fielding, ..."
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan","Salvatore Papa, Arturo Pirovano, Giuseppe de L..."


In [9]:
# row count should be len(df_movies) + len(df_sample); column count is unchanged
print(df_concat_vertically.shape)

(128783, 7)


## Concatenate horizontally

To concatenate horizontally (along the columns), the two DataFrames should share a common index: rows are matched by index label, not by position.

In [10]:
# Two small frames with the same default index (0-3) but different columns,
# to demonstrate placing columns side by side.
df1 = pd.DataFrame(
    data=[('A', 30), ('B', 23), ('C', 25), ('D', 22)],
    columns=['id', 'age'],
)
df2 = pd.DataFrame(
    data=['Doctor', 'Statistician', 'Accountant', 'Developer'],
    columns=['job'],
)

In [11]:
# axis=1 places df2's columns to the right of df1's, matching rows by index label
pd.concat([df1, df2], axis=1)

Unnamed: 0,id,age,job
0,A,30,Doctor
1,B,23,Statistician
2,C,25,Accountant
3,D,22,Developer


### Exercise

In [12]:
# (rows, columns) of the two frames about to be joined side by side, one per line
print(df_movies.shape, df_ratings.shape, sep='\n')

(85855, 7)
(85855, 3)


In [13]:
# concatenate df_movies and df_ratings horizontally (along the columns):
# 'imdb_title_id' becomes the index of each frame so that concat(axis=1)
# lines the rows up by title id
movies_by_id = df_movies.set_index('imdb_title_id')
ratings_by_id = df_ratings.set_index('imdb_title_id')
df_concat_horizontally = pd.concat([movies_by_id, ratings_by_id], axis=1)

In [15]:
# 6 movie columns + 2 rating columns: 'imdb_title_id' moved into the index of both inputs
print(df_concat_horizontally.shape)

(85855, 8)
