In [None]:
"""Find id and title of films without a category"""

In [None]:
"""SQL table creation and data insertion"""
drop table if exists  film ;
drop table if exists  category  ; 
drop table if exists  film_category  ; 

CREATE TABLE IF NOT EXISTS film (
  film_id smallint(5)  NOT NULL DEFAULT '0',
  title varchar(255) NOT NULL,
  description text,
  PRIMARY KEY (film_id));

CREATE TABLE category  (
   category_id  tinyint(3)  NOT NULL ,
   name  varchar(25) NOT NULL, `last_update` timestamp,
  PRIMARY KEY ( category_id ));

CREATE TABLE film_category  (
   film_id  smallint(5)  NOT NULL,
   category_id  tinyint(3)  NOT NULL, `last_update` timestamp);

INSERT INTO film VALUES(1,'ACADEMY DINOSAUR','A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies');
INSERT INTO film VALUES(2,'ACE GOLDFINGER','A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China');
INSERT INTO film VALUES(3,'ADAPTATION HOLES','A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory');

INSERT INTO category VALUES(1,'Action','2006-02-14 20:46:27');
INSERT INTO category VALUES(2,'Animation','2006-02-14 20:46:27');
INSERT INTO category VALUES(3,'Children','2006-02-14 20:46:27');
INSERT INTO category VALUES(4,'Classics','2006-02-14 20:46:27');
INSERT INTO category VALUES(5,'Comedy','2006-02-14 20:46:27');
INSERT INTO category VALUES(6,'Documentary','2006-02-14 20:46:27');
INSERT INTO category VALUES(7,'Drama','2006-02-14 20:46:27');
INSERT INTO category VALUES(8,'Family','2006-02-14 20:46:27');
INSERT INTO category VALUES(9,'Foreign','2006-02-14 20:46:27');
INSERT INTO category VALUES(10,'Games','2006-02-14 20:46:27');
INSERT INTO category VALUES(11,'Horror','2006-02-14 20:46:27');

INSERT INTO film_category VALUES(1,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(2,11,'2006-02-14 21:07:09');

In [None]:
"""SQL solution"""
# 1
SELECT f.film_id, f.title
FROM film AS f
LEFT JOIN film_category AS fc
ON f.film_id = fc.film_id
WHERE fc.category_id IS NULL

# 2
SELECT film_id, title
FROM film
WHERE film_id NOT IN (SELECT film_id FROM film_category)

In [1]:
"""pandas dataframe creation"""
import numpy as np
import pandas as pd

tf = np.array([[1,'ACADEMY DINOSAUR','A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies'],
               [2,'ACE GOLDFINGER','A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China'],
               [3,'ADAPTATION HOLES','A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory']])
tfc = np.array([[1,6,'2006-02-14 21:07:09'],
                [2,11,'2006-02-14 21:07:09']])
film = pd.DataFrame(data=tf, columns=['film_id', 'title', 'description'])
film_category = pd.DataFrame(data=tfc, columns=['film_id', 'category_id', 'last_update'])
film.head()

Unnamed: 0,film_id,title,description
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...


In [10]:
film_category['category_id'] = film_category['category_id'].astype('int')
film_category.head()

Unnamed: 0,film_id,category_id,last_update
0,1,6,2006-02-14 21:07:09
1,2,11,2006-02-14 21:07:09


In [8]:
film_category.dtypes

film_id        object
category_id    object
last_update    object
dtype: object

In [3]:
# in order to process a large table
# copy and paaste the SQL code for table creation and data insertion into data.txt
category = pd.read_csv('data.txt', sep=",", header=None)
category.columns = ['category_id', 'name', 'last_update']
for i in range(len(category)):
    category.iloc[i, 0] = category.iloc[i, 0][28:]
    category.iloc[i, 1] = category.iloc[i, 1][1:-1]
    category.iloc[i, -1] = category.iloc[i, -1][1:-3]
category['category_id'] = category['category_id'].astype('int')
category.head()

Unnamed: 0,category_id,name,last_update
0,1,Action,2006-02-14 20:46:27
1,2,Animation,2006-02-14 20:46:27
2,3,Children,2006-02-14 20:46:27
3,4,Classics,2006-02-14 20:46:27
4,5,Comedy,2006-02-14 20:46:27


In [27]:
"""
To detect NaN:
np.isnan() for numpy array
.isna() or .isnull() for pd.dataframe, these two do exactly the same thing
"""
df = pd.merge(film_category[['film_id', 'category_id']], film[['film_id', 'title']], on='film_id', how='outer')
df[df['category_id'].isna()][['film_id', 'title']]

Unnamed: 0,film_id,title
2,3,ADAPTATION HOLES


In [26]:
df[df['category_id'].isnull()][['film_id', 'title']]

Unnamed: 0,film_id,title
2,3,ADAPTATION HOLES
