In [2]:
import pandas as pd
import numpy as np

### An incredibly useful function, apply

In [3]:
# Column labels for the tab-separated IMDb export. Because the labels are
# supplied here, the first line of the file is read as data, not as a header.
names = ['id', 'title', 'year', 'score', 'votes', 'length', 'genres']
data = pd.read_csv(
    '../data/imdb_top_10000.txt',
    sep='\t',
    names=names,
    index_col=0,  # the first column ('id') becomes the row index
)

In [4]:
# apply calls the given function on every element and returns a new Series;
# the result is not assigned, so data itself is left unchanged.
data['score'].apply(lambda score: score * 10).head()

id
tt0111161    92.0
tt0110912    90.0
tt0137523    88.0
tt0133093    87.0
tt1375666    89.0
Name: score, dtype: float64

In [5]:
# Preview the first rows of the table as loaded, before any cleaning.
data.head()

Unnamed: 0_level_0,title,year,score,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption (1994),1994,9.2,619479,142 mins.,Crime|Drama
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0137523,Fight Club (1999),1999,8.8,458173,139 mins.,Drama|Mystery|Thriller
tt0133093,The Matrix (1999),1999,8.7,448114,136 mins.,Action|Adventure|Sci-Fi
tt1375666,Inception (2010),2010,8.9,385149,148 mins.,Action|Adventure|Sci-Fi|Thriller


### Cleaning up our dataset

Split into groups of 2-3 and discuss how you would approach cleaning:

- the title column to remove the year in parentheses since it's redundant
- the length column to remove mins.
- splitting up the genres column (this one is hard) into a column for each genre with True if the movie is that particular genre and False otherwise

Hint: when ideating as a group, it's helpful to use "pseudocode"

Once you're satisfied as a group, implement the ideas individually!

#### Note: if you're stuck, ask me for hints!

In [6]:
# HINT 1
# Negative slice indices count from the end of the string, so [-6:] yields
# the last six characters -- here the parenthesised year.
some_string = 'The Shawshank (2014)'
some_string[-6:]

'(2014)'

In [7]:
# HINT 2
# str.split breaks a string at every occurrence of the separator and returns
# the pieces as a list.
some_splitting = 'Crime|Action|Drama'
some_splitting.split('|')

['Crime', 'Action', 'Drama']

In [8]:
# HINT 3
# Building a set from a list discards the duplicates and keeps one of each value.
string_split = ['crime', 'drama', 'crime', 'action', 'action']
unique_collection = set(string_split)
# print(...) with a single argument works on both Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(unique_collection)

set(['drama', 'action', 'crime'])


In [9]:
# remove the year in parentheses
# Assumes every title ends in ' (YYYY)' -- a space plus six characters --
# so the last 7 characters are cut off.
data.loc[:, 'title'] = data['title'].str.slice(0, -7)

In [10]:
# Check that the year suffix is gone from the title column.
data.head()

Unnamed: 0_level_0,title,year,score,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,142 mins.,Crime|Drama
tt0110912,Pulp Fiction,1994,9.0,490065,154 mins.,Crime|Thriller
tt0137523,Fight Club,1999,8.8,458173,139 mins.,Drama|Mystery|Thriller
tt0133093,The Matrix,1999,8.7,448114,136 mins.,Action|Adventure|Sci-Fi
tt1375666,Inception,2010,8.9,385149,148 mins.,Action|Adventure|Sci-Fi|Thriller


In [11]:
# remove the mins. from length
# Split each value at the ' mins.' suffix and keep the piece in front of it.
data.loc[:, 'length'] = (
    data['length']
    .str.split(' mins.')
    .map(lambda pieces: pieces[0])
)

In [12]:
# Check that the length column no longer carries the ' mins.' suffix.
data.head()

Unnamed: 0_level_0,title,year,score,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,142,Crime|Drama
tt0110912,Pulp Fiction,1994,9.0,490065,154,Crime|Thriller
tt0137523,Fight Club,1999,8.8,458173,139,Drama|Mystery|Thriller
tt0133093,The Matrix,1999,8.7,448114,136,Action|Adventure|Sci-Fi
tt1375666,Inception,2010,8.9,385149,148,Action|Adventure|Sci-Fi|Thriller


In [13]:
# change to int
# Assign the converted column by name. `data.loc[:, 'length'] = ...` writes the
# new values into the existing object-dtype column on recent pandas versions,
# which leaves the column as object instead of int.
data['length'] = data['length'].astype('int')

In [29]:
# Drop every row that has a missing value in any column; the genre parsing
# further down calls .split on each genres entry.
data = data.dropna(axis=0, how='any')

In [30]:
# solving the genres
# Gather every distinct genre label that occurs in the pipe-separated
# genres column.
gen = {label for genre_field in data['genres'] for label in genre_field.split('|')}
# Sorted list of the distinct labels.
gens = sorted(gen)

In [31]:
# Display the sorted list of distinct genre labels.
gens

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [32]:
# make columns
# Work on a copy so that data keeps its original set of columns.
copying = data.copy()
# Split each genres string once, up front, instead of re-splitting every row
# for each genre inside the loop.
genre_lists = data['genres'].str.split('|')
for each_genre in gens:
    # True where the movie's genre list contains this genre, False otherwise.
    copying[each_genre] = genre_lists.apply(
        lambda movie_genres: each_genre in movie_genres
    )

In [33]:
# data itself is unchanged: the genre columns were added to the copy only.
data.head()

Unnamed: 0_level_0,title,year,score,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,142,Crime|Drama
tt0110912,Pulp Fiction,1994,9.0,490065,154,Crime|Thriller
tt0137523,Fight Club,1999,8.8,458173,139,Drama|Mystery|Thriller
tt0133093,The Matrix,1999,8.7,448114,136,Action|Adventure|Sci-Fi
tt1375666,Inception,2010,8.9,385149,148,Action|Adventure|Sci-Fi|Thriller


In [34]:
# Spot-check five random rows of the widened copy. No random_state is passed,
# so a different set of rows is shown on every run.
copying.sample(5)

Unnamed: 0_level_0,title,year,score,votes,length,genres,Action,Adult,Adventure,Animation,...,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0108000,Ruby in Paradise,1993,7.0,1544,114,Drama|Romance,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
tt0109770,The Fantastic Four,1994,3.8,1595,90,Action|Adventure|Sci-Fi|Fantasy,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
tt0051622,The Fly,1958,7.0,6449,94,Horror|Sci-Fi|Thriller,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
tt0428059,Toss-Up,2004,7.7,1976,0,Drama,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt0290334,X2,2003,7.7,130640,133,Action|Adventure|Sci-Fi|Thriller,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False


In [67]:
# sending to a csv file called test
# Keep the index in the output: it holds the movie id (made the index via
# index_col=0 when loading), and index=False would leave the ids out of the file.
copying.to_csv('test.csv', index=True)