In [None]:
"""For each category that contains at least 5 films in total, find the category name and the number of films in that category whose description contains 'robot'."""

In [None]:
"""SQL table creation and data insertion"""
-- Start from a clean slate so this cell can be re-run.
DROP TABLE IF EXISTS film;
DROP TABLE IF EXISTS category;
DROP TABLE IF EXISTS film_category;

-- One row per film; `description` is the free text the solution searches.
CREATE TABLE IF NOT EXISTS film (
    film_id     SMALLINT(5)  NOT NULL DEFAULT '0',
    title       VARCHAR(255) NOT NULL,
    description TEXT,
    PRIMARY KEY (film_id)
);

-- Lookup table of category names.
CREATE TABLE category (
    category_id   TINYINT(3)  NOT NULL,
    name          VARCHAR(25) NOT NULL,
    `last_update` TIMESTAMP,
    PRIMARY KEY (category_id)
);

-- Link table assigning films to categories (no key is declared on it).
CREATE TABLE film_category (
    film_id       SMALLINT(5) NOT NULL,
    category_id   TINYINT(3)  NOT NULL,
    `last_update` TIMESTAMP
);

-- film rows: (film_id, title, description). Only film 6 mentions 'robot'.
-- NOTE: the pandas cells further down parse these statements by fixed character
-- offsets after they are pasted into data.txt, so keep the exact
-- "INSERT INTO <table> VALUES(" prefix and ");" ending, and paste only the
-- INSERT lines (not these comments).
INSERT INTO film VALUES(1,'ACADEMY DINOSAUR','A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies');
INSERT INTO film VALUES(2,'ACE GOLDFINGER','A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China');
INSERT INTO film VALUES(3,'ADAPTATION HOLES','A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory');
INSERT INTO film VALUES(4,'AFFAIR PREJUDICE','A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank');
INSERT INTO film VALUES(5,'AFRICAN EGG','A Fast-Paced Documentary of a Pastry Chef And a Dentist who must Pursue a Forensic Psychologist in The Gulf of Mexico');
INSERT INTO film VALUES(6,'AGENT TRUMAN','A Intrepid Panorama of a robot And a Boy who must Escape a Sumo Wrestler in Ancient China');
INSERT INTO film VALUES(7,'AIRPLANE SIERRA','A Touching Saga of a Hunter And a Butler who must Discover a Butler in A Jet Boat');
INSERT INTO film VALUES(8,'AIRPORT POLLOCK','A Epic Tale of a Moose And a Girl who must Confront a Monkey in Ancient India');
INSERT INTO film VALUES(9,'ALABAMA DEVIL','A Thoughtful Panorama of a Database Administrator And a Mad Scientist who must Outgun a Mad Scientist in A Jet Boat');
INSERT INTO film VALUES(10,'ALADDIN CALENDAR','A Action-Packed Tale of a Man And a Lumberjack who must Reach a Feminist in Ancient China');

-- category rows: (category_id, name, last_update).
INSERT INTO category VALUES(1,'Action','2006-02-14 20:46:27');
INSERT INTO category VALUES(2,'Animation','2006-02-14 20:46:27');
INSERT INTO category VALUES(3,'Children','2006-02-14 20:46:27');
INSERT INTO category VALUES(4,'Classics','2006-02-14 20:46:27');
INSERT INTO category VALUES(5,'Comedy','2006-02-14 20:46:27');
INSERT INTO category VALUES(6,'Documentary','2006-02-14 20:46:27');
INSERT INTO category VALUES(7,'Drama','2006-02-14 20:46:27');
INSERT INTO category VALUES(8,'Family','2006-02-14 20:46:27');
INSERT INTO category VALUES(9,'Foreign','2006-02-14 20:46:27');
INSERT INTO category VALUES(10,'Games','2006-02-14 20:46:27');
INSERT INTO category VALUES(11,'Horror','2006-02-14 20:46:27');
INSERT INTO category VALUES(12,'Music','2006-02-14 20:46:27');
INSERT INTO category VALUES(13,'New','2006-02-14 20:46:27');
INSERT INTO category VALUES(14,'Sci-Fi','2006-02-14 20:46:27');
INSERT INTO category VALUES(15,'Sports','2006-02-14 20:46:27');
INSERT INTO category VALUES(16,'Travel','2006-02-14 20:46:27');

-- film_category rows: (film_id, category_id, last_update).
-- Category 6 is linked to five films (1, 3, 5, 6, 8); it is the only category
-- in this sample that reaches the ">= 5 films" threshold.
INSERT INTO film_category VALUES(1,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(2,11,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(3,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(4,11,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(5,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(6,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(7,5,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(8,6,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(9,11,'2006-02-14 21:07:09');
INSERT INTO film_category VALUES(10,15,'2006-02-14 21:07:09');

In [None]:
"""SQL solution"""
-- For every category that holds at least 5 films overall, return the category
-- name and the number of its films whose description contains 'robot'.
-- Categories that qualify but have no matching film produce no row.
SELECT c.name, COUNT(*)
FROM film AS f
INNER JOIN film_category AS fc
    ON f.film_id = fc.film_id
INNER JOIN category AS c
    ON fc.category_id = c.category_id
WHERE f.description LIKE '%robot%'
  -- the size threshold is evaluated over ALL films of the category,
  -- not only the ones that mention 'robot'
  AND fc.category_id IN (
      SELECT category_id
      FROM film_category
      GROUP BY category_id
      HAVING COUNT(*) >= 5
  )
-- every selected non-aggregate column is listed, so the query does not rely on
-- engine-specific functional-dependency detection
GROUP BY fc.category_id, c.name;

In [16]:
"""pandas dataframe creation"""
import pandas as pd

# To process a large table, copy and paste the film INSERT statements from the
# SQL cell into data.txt (one statement per line) before running this cell.
# NOTE(review): the category and film_category cells read the same 'data.txt',
# so the file has to contain the film statements at the moment this cell runs.
# NOTE(review): read_csv splits on every comma, so a title or description that
# itself contains a comma would not parse into exactly three columns.
film = pd.read_csv('data.txt', sep=",", header=None)
film.columns = ['film_id', 'title', 'description']

# Each line has the shape: INSERT INTO film VALUES(<id>,'<title>','<description>');
# Derive the offset from the statement prefix instead of hard-coding 24.
FILM_PREFIX_LEN = len("INSERT INTO film VALUES(")

# Vectorised string slicing replaces the row-by-row iloc loop.
film['film_id'] = film['film_id'].str[FILM_PREFIX_LEN:]  # stays a string, as before
film['title'] = film['title'].str[1:-1]                  # strip the surrounding quotes
film['description'] = film['description'].str[1:-3]      # strip leading ' and trailing ');
film.head()

Unnamed: 0,film_id,title,description
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...


In [85]:
# To process a large table, copy and paste the category INSERT statements from
# the SQL cell into data.txt (one statement per line) before running this cell.
# NOTE(review): the film and film_category cells read the same 'data.txt', so
# the file has to contain the category statements at the moment this cell runs.
category = pd.read_csv('data.txt', sep=",", header=None)
category.columns = ['category_id', 'name', 'last_update']

# Each line has the shape: INSERT INTO category VALUES(<id>,'<name>','<timestamp>');
# Derive the offset from the statement prefix instead of hard-coding 28.
CATEGORY_PREFIX_LEN = len("INSERT INTO category VALUES(")

# Vectorised string slicing replaces the row-by-row iloc loop.
category['category_id'] = (
    category['category_id'].str[CATEGORY_PREFIX_LEN:].astype('int')  # numeric key for merging
)
category['name'] = category['name'].str[1:-1]                # strip the surrounding quotes
category['last_update'] = category['last_update'].str[1:-3]  # strip leading ' and trailing ');
category.head()

Unnamed: 0,category_id,name,last_update
0,1,Action,2006-02-14 20:46:27
1,2,Animation,2006-02-14 20:46:27
2,3,Children,2006-02-14 20:46:27
3,4,Classics,2006-02-14 20:46:27
4,5,Comedy,2006-02-14 20:46:27


In [46]:
# To process a large table, copy and paste the film_category INSERT statements
# from the SQL cell into data.txt (one statement per line) before running this cell.
# NOTE(review): the film and category cells read the same 'data.txt', so the
# file has to contain the film_category statements at the moment this cell runs.
film_category = pd.read_csv('data.txt', sep=",", header=None)
film_category.columns = ['film_id', 'category_id', 'last_update']

# Each line has the shape: INSERT INTO film_category VALUES(<film_id>,<category_id>,'<timestamp>');
# Derive the offset from the statement prefix instead of hard-coding 33.
FILM_CATEGORY_PREFIX_LEN = len("INSERT INTO film_category VALUES(")

# Vectorised string slicing replaces the row-by-row iloc loop.
# category_id is a bare number in the file and is left exactly as read_csv parsed it.
film_category['film_id'] = film_category['film_id'].str[FILM_CATEGORY_PREFIX_LEN:]  # stays a string
film_category['last_update'] = film_category['last_update'].str[1:-3]  # strip leading ' and trailing ');
film_category.head()

Unnamed: 0,film_id,category_id,last_update
0,1,6,2006-02-14 21:07:09
1,2,11,2006-02-14 21:07:09
2,3,6,2006-02-14 21:07:09
3,4,11,2006-02-14 21:07:09
4,5,6,2006-02-14 21:07:09


In [76]:
# find categories with number of films >= 5
# Count films per category, then flag the categories that reach the threshold.
# rename_axis / reset_index(name=...) set both column labels explicitly, so the
# result no longer depends on how value_counts() names its index and values
# (that naming differs between pandas 1.x and pandas 2.x).
films_per_category = film_category['category_id'].value_counts()
category_cnt = (
    (films_per_category >= 5)
    .rename_axis('category_id')
    .reset_index(name='cnt>=5')
)
category_cnt

Unnamed: 0,category_id,cnt>=5
0,6,True
1,11,False
2,15,False
3,5,False


In [77]:
# keep only the categories whose 'cnt>=5' flag is True
reaches_threshold = category_cnt['cnt>=5']
category_cnt = category_cnt.loc[reaches_threshold]
category_cnt

Unnamed: 0,category_id,cnt>=5
0,6,True


In [94]:
# attach a category_id to every film through the film_category link table
film_to_category = film_category[['film_id', 'category_id']]
df = film.merge(film_to_category, how='inner', on='film_id')
df

Unnamed: 0,film_id,title,description,category_id
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,6
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,11
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,6
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,11
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,6
5,6,AGENT TRUMAN,A Intrepid Panorama of a robot And a Boy who m...,6
6,7,AIRPLANE SIERRA,A Touching Saga of a Hunter And a Butler who m...,5
7,8,AIRPORT POLLOCK,A Epic Tale of a Moose And a Girl who must Con...,6
8,9,ALABAMA DEVIL,A Thoughtful Panorama of a Database Administra...,11
9,10,ALADDIN CALENDAR,A Action-Packed Tale of a Man And a Lumberjack...,15


In [95]:
# find films with 'robot' in their descriptions
# Vectorised, case-sensitive literal substring match on the description column
# (addressed by name rather than by position). na=False treats a missing
# description as "no match" instead of raising, since the SQL column is nullable.
df = df.assign(robot=df['description'].str.contains('robot', regex=False, na=False))
df = df[df['robot']]
df

Unnamed: 0,film_id,title,description,category_id,robot
5,6,AGENT TRUMAN,A Intrepid Panorama of a robot And a Boy who m...,6,True


In [96]:
# inner join against the qualifying ids: films in categories with < 5 films drop out
qualifying_ids = category_cnt[['category_id']]
df = df.merge(qualifying_ids, how='inner', on='category_id')
# look up the readable category name for each remaining film
category_names = category[['category_id', 'name']]
df = df.merge(category_names, how='inner', on='category_id')
df

Unnamed: 0,film_id,title,description,category_id,robot,name
0,6,AGENT TRUMAN,A Intrepid Panorama of a robot And a Boy who m...,6,True,Documentary


In [106]:
# count by category names
# rename_axis / reset_index(name=...) label the two columns explicitly, so the
# output is ['category', 'cnt'] regardless of how value_counts() names its
# result (which differs between pandas 1.x and pandas 2.x).
(
    df['name']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='cnt')
)

Unnamed: 0,category,cnt
0,Documentary,1
