# Preparing Data for SQLite Storage

In [1]:
import pandas as pd
# Setting this option to None switches off pandas' chained-assignment
# (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides the warning rather than removing its cause -- confirm it is still wanted.
pd.options.mode.chained_assignment = None

In [2]:
# Read the raw awards table, decoding the file as ISO-8859-1.
df = pd.read_csv('academy_awards.csv', encoding='ISO-8859-1')
# Preview the first rows to see the column layout.
df.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010 (83rd),Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010 (83rd),Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010 (83rd),Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,
3,2010 (83rd),Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,
4,2010 (83rd),Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,


In [3]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10137 entries, 0 to 10136
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Year             10137 non-null  object
 1   Category         10137 non-null  object
 2   Nominee          10137 non-null  object
 3   Additional Info  9011 non-null   object
 4   Won?             10137 non-null  object
 5   Unnamed: 5       11 non-null     object
 6   Unnamed: 6       12 non-null     object
 7   Unnamed: 7       3 non-null      object
 8   Unnamed: 8       2 non-null      object
 9   Unnamed: 9       1 non-null      object
 10  Unnamed: 10      1 non-null      object
dtypes: object(11)
memory usage: 871.3+ KB


1. The Year column holds string values; we should convert these to integers.

2. Additional Info has the movie name and the role name, this can be split into two columns.

3. The "Won?" column should be changed to 0's and 1's.

4. There are a lot of NaN values under the unnamed columns; we should consider dropping them.

First, let's take a look at the unnamed columns. We can use .value_counts() to see if there are any significant values in these columns.

In [4]:
# Walk the unnamed columns from "Unnamed: 10" down to "Unnamed: 5" and print
# the value counts of each one, in that order.
for column_number in range(10, 4, -1):
    print(df[f'Unnamed: {column_number}'].value_counts())

*    1
Name: Unnamed: 10, dtype: int64
*    1
Name: Unnamed: 9, dtype: int64
*                                                 1
 understanding comedy genius - Mack Sennett.""    1
Name: Unnamed: 8, dtype: int64
 while requiring no dangerous solvents. [Systems]"    1
*                                                     1
 kindly                                               1
Name: Unnamed: 7, dtype: int64
*                                                                   9
 flexibility and water resistance                                   1
 direct radiator bass style cinema loudspeaker systems. [Sound]"    1
 sympathetic                                                        1
Name: Unnamed: 6, dtype: int64
*                                                                                                               7
 resilience                                                                                                     1
 D.B. "Don" Keele and Mark E. Engebretson has resu

It doesn't look like there is any significant information under these columns. We can probably drop all the unnamed columns, but first let's convert the year column to integers.

In [5]:
# Keep only the leading four characters of each Year value (e.g. "2010 (83rd)"
# becomes 2010) and store them as integers.
# Casting to str first keeps this cell re-runnable: once Year has already been
# converted to integers, calling the .str accessor on it directly would raise.
df['Year'] = df['Year'].astype(str).str[0:4].astype(int)
df.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,
3,2010,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,
4,2010,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,


In [6]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include = 'all')

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
count,10137.0,10137,10137,9011,10137,11,12,3,2,1,1
unique,,40,6001,6424,16,5,4,3,2,1,1
top,,Writing,Meryl Streep,Metro-Goldwyn-Mayer,NO,*,*,while requiring no dangerous solvents. [Syste...,*,*,*
freq,,888,16,60,7168,7,9,1,1,1,1
mean,1970.330768,,,,,,,,,,
std,23.332917,,,,,,,,,,
min,1927.0,,,,,,,,,,
25%,1950.0,,,,,,,,,,
50%,1970.0,,,,,,,,,,
75%,1991.0,,,,,,,,,,


In [10]:
# Count how many rows fall under each award category.
df['Category'].value_counts()

Writing                                                        888
Music (Scoring)                                                748
Cinematography                                                 572
Art Direction                                                  552
Best Picture                                                   485
Sound                                                          460
Short Film (Live Action)                                       434
Scientific and Technical (Technical Achievement Award)         428
Music (Song)                                                   413
Actress -- Leading Role                                        411
Directing                                                      410
Actor -- Leading Role                                          408
Film Editing                                                   385
Costume Design                                                 384
Actress -- Supporting Role                                    

We are only interested in data from after the year 2000, and only in the acting award categories.

In [11]:
# Keep only the rows whose Year is greater than 2000.
later_than_2000 = df[df['Year'] > 2000]

# The four acting categories to keep.
award_categories = [
    "Actor -- Leading Role",
    "Actor -- Supporting Role",
    "Actress -- Leading Role",
    "Actress -- Supporting Role"
]

# Take an explicit copy of the filtered rows: a later cell adds a column to
# `nominations`, and writing to an independent frame avoids depending on the
# chained-assignment warning being silenced.
nominations = later_than_2000[later_than_2000['Category'].isin(award_categories)].copy()
nominations.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,
3,2010,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,
4,2010,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,


Convert all the "NO" values to 0 and all the "YES" values to 1 in the 'Won?' column, storing the result in a new 'Won' column.

In [14]:
# Lookup from the text flag in "Won?" to its integer form.
yes_no = {
    "NO": 0,
    "YES": 1
}

# Build a new frame with the extra "Won" column instead of assigning into the
# filtered frame, so no chained-assignment warning is involved.
# Values of "Won?" that are not keys of `yes_no` become NaN under .map().
nominations = nominations.assign(Won=nominations['Won?'].map(yes_no))
nominations.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Won
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,,0
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,,0
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,,0
3,2010,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},YES,,,,,,,1
4,2010,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},NO,,,,,,,0


Next, we'll drop the columns we don't need.

In [16]:
# Columns to discard: the original text flag (now replaced by "Won") and the
# six unnamed columns.
columns_drop = [
    "Won?",
    "Unnamed: 5", "Unnamed: 6", "Unnamed: 7",
    "Unnamed: 8", "Unnamed: 9", "Unnamed: 10",
]
final_nominations = nominations.drop(columns=columns_drop)

In [17]:
# Preview the frame after the unneeded columns have been dropped.
final_nominations.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},0
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},0
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},0
3,2010,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI'},1
4,2010,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston'},0


The last thing we'll have to do is separate the "Additional Info" column into two new columns, "Movie" and "Character". We'll have to manipulate the strings and then split the two values held in this one column into two new columns.

In [18]:
# Remove any trailing "'" and "}" characters from the right-hand end of each
# "Additional Info" entry.
additional_info_raw = final_nominations['Additional Info']
additional_info_one = additional_info_raw.str.rstrip("'}")
additional_info_one.head()

0                        Biutiful {'Uxbal
1             True Grit {'Rooster Cogburn
2    The Social Network {'Mark Zuckerberg
3      The King's Speech {'King George VI
4                127 Hours {'Aron Ralston
Name: Additional Info, dtype: object

In [19]:
# Split each entry on the " {'" separator, producing a list of the pieces
# on either side of it.
additional_info_two = additional_info_one.str.split(" {'")
additional_info_two.head()

0                        [Biutiful, Uxbal]
1             [True Grit, Rooster Cogburn]
2    [The Social Network, Mark Zuckerberg]
3      [The King's Speech, King George VI]
4                [127 Hours, Aron Ralston]
Name: Additional Info, dtype: object

In [20]:
# First piece of each split entry is the movie, second piece is the character.
movie_names = additional_info_two.str.get(0)
characters = additional_info_two.str.get(1)

In [21]:
# Append the "Movie" and "Character" columns, then discard the combined
# "Additional Info" column they were derived from.
final_nominations = (
    final_nominations
    .assign(Movie=movie_names, Character=characters)
    .drop(columns=["Additional Info"])
)
final_nominations.head()

Unnamed: 0,Year,Category,Nominee,Won,Movie,Character
0,2010,Actor -- Leading Role,Javier Bardem,0,Biutiful,Uxbal
1,2010,Actor -- Leading Role,Jeff Bridges,0,True Grit,Rooster Cogburn
2,2010,Actor -- Leading Role,Jesse Eisenberg,0,The Social Network,Mark Zuckerberg
3,2010,Actor -- Leading Role,Colin Firth,1,The King's Speech,King George VI
4,2010,Actor -- Leading Role,James Franco,0,127 Hours,Aron Ralston


# Save Data to SQLite Storage
Now that we are done cleaning up the data, we can do some simple analysis using sqlite3.

In [27]:
import sqlite3
# Open a connection to the SQLite database file "nominations.db"; sqlite3
# creates the file if it does not exist yet and reuses it otherwise.
conn = sqlite3.connect("nominations.db")
# Cursor used by the query cells below.
cursor = conn.cursor()

In [29]:
# Write the cleaned frame to the "nominations" table, without the DataFrame index.
# if_exists="replace" drops and recreates the table when it is already present,
# so the cell can be re-run (and the notebook re-executed against an existing
# nominations.db) instead of failing with "Table 'nominations' already exists".
final_nominations.to_sql("nominations", conn, index=False, if_exists="replace")

In [31]:
# List every entry recorded in sqlite_master.
q0 = "\nSELECT * FROM sqlite_master;\n"

cursor.execute(q0)
db_schema = cursor.fetchall()

# Print one entry per block, followed by the number of entries.
for schema_row in db_schema:
    print(schema_row, '\n')

print(len(db_schema))

('table', 'nominations', 'nominations', 2, 'CREATE TABLE "nominations" (\n"Year" INTEGER,\n  "Category" TEXT,\n  "Nominee" TEXT,\n  "Won" INTEGER,\n  "Movie" TEXT,\n  "Character" TEXT\n)') 

1


In [32]:
# Ask SQLite for the column definitions of the nominations table.
q1 = "\nPRAGMA table_info (nominations);\n"
cursor.execute(q1)
result = cursor.fetchall()
result

[(0, 'Year', 'INTEGER', 0, None, 0),
 (1, 'Category', 'TEXT', 0, None, 0),
 (2, 'Nominee', 'TEXT', 0, None, 0),
 (3, 'Won', 'INTEGER', 0, None, 0),
 (4, 'Movie', 'TEXT', 0, None, 0),
 (5, 'Character', 'TEXT', 0, None, 0)]

In [33]:
# Fetch the first five stored rows to confirm the data was written.
q2 = "\nSELECT * FROM nominations LIMIT 5;\n"
cursor.execute(q2)
result = cursor.fetchall()
result

[(2010, 'Actor -- Leading Role', 'Javier Bardem', 0, 'Biutiful', 'Uxbal'),
 (2010,
  'Actor -- Leading Role',
  'Jeff Bridges',
  0,
  'True Grit',
  'Rooster Cogburn'),
 (2010,
  'Actor -- Leading Role',
  'Jesse Eisenberg',
  0,
  'The Social Network',
  'Mark Zuckerberg'),
 (2010,
  'Actor -- Leading Role',
  'Colin Firth',
  1,
  "The King's Speech",
  'King George VI'),
 (2010,
  'Actor -- Leading Role',
  'James Franco',
  0,
  '127 Hours',
  'Aron Ralston')]

In [34]:
# Close the database connection now that the queries are done.
conn.close()