In [1]:
import sqlite3
import pandas as pd
import json
import re    # regex module
from datetime import date, datetime, timedelta

In [2]:
f=open('students_info.json','r')
data=json.load(f)

In [3]:
conn = sqlite3.connect('students.db')
cursor = conn.cursor()

In [4]:
create_query = """ CREATE TABLE students
                    (NAME TEXT NOT NULL,
                    BIRTHDAY NUMERIC,
                    FAV_FOOD TEXT NOT NULL,
                    BIRTHPLACE TEXT NOT NULL,
                    YEARS_IN_NYC INTEGER
                    );
                """

In [5]:
cursor.execute('DROP TABLE IF EXISTS students;')
cursor.execute(create_query)

<sqlite3.Cursor at 0x7f848fe4bce0>

In [6]:
def date_format(birthday):
    spltd_bday = re.split('[/ -]|[.]', birthday)
    
    if len(spltd_bday)<3:
        return None
    else:
        m,d,y = spltd_bday
        date_obj = date(int(y),int(m),int(d))
        
        return datetime.strftime(date_obj, "%m/%d/%Y")

In [7]:
students_list = []

for student in data:
    if type(student['fav_food']) == list:
        for fav_food in student['fav_food']:
            student_tuple = (student['name'], 
                             date_format(student['birthday']),
                             fav_food,
                             student['birthplace'],
                             student['years_in_nyc'])
            students_list.append(student_tuple)
    else:
        student_tuple = (student['name'], 
                         date_format(student['birthday']),
                         student['fav_food'],
                         student['birthplace'],
                         student['years_in_nyc'])
        
        students_list.append(student_tuple)

insert_statement = '''INSERT INTO students (name, birthday, fav_food, birthplace, years_in_nyc) VALUES ( ?, ?, ?, ?, ? )'''

In [8]:
print(len(students_list))
students_list[0]

12


('Sean Abu Wilson', '02/06/1985', 'Bread', 'Birmingham, AL', 9)

In [9]:
cursor.executemany(insert_statement, students_list)

<sqlite3.Cursor at 0x7f848fe4bce0>

In [10]:
df = pd.DataFrame(cursor.execute("SELECT * FROM students").fetchall())
df.columns = [x[0] for x in cursor.description]
df
# cursor.execute("""SELECT * 
#                 FROM students
#                 ;""").fetchall()

Unnamed: 0,NAME,BIRTHDAY,FAV_FOOD,BIRTHPLACE,YEARS_IN_NYC
0,Sean Abu Wilson,02/06/1985,Bread,"Birmingham, AL",9
1,christa,01/11/1996,buffalo wings,new york,2
2,Dariga Kokenova,,Italian,Kazakhstan,14
3,Rafael Ferreira,10/13/1991,Linguine con Vongole,"Denver, CO",5
4,Saad Raees,05/13/1992,ice cream,"Karachi,Pakistan",10
5,Ivan Zakharchuk,01/27/1988,Mediterranean,Kiev,10
6,Anjanette Vanessa Jia Craynock Santiago Delgad...,10/03/1987,Japanese,"Carolina, Puerto Rico",5
7,Guy Monahan,07/21/1988,Dumplings,"Jersey City, NJ",31
8,Jonathan Silverman,08/04/1982,Pizza,"Brooklyn, NY",37
9,Nick,02/27/1988,Japanese,UK,10


# Which student was born closest to the cohort's graduation date?

In [11]:
# march 26 2021

In [12]:
cursor.execute("""SELECT * 
                FROM students 
                ORDER BY BIRTHDAY DESC
                ;""").fetchall()

[('Rafael Ferreira', '10/13/1991', 'Linguine con Vongole', 'Denver, CO', 5),
 ('Anjanette Vanessa Jia Craynock Santiago Delgado Betancourt Rivera',
  '10/03/1987',
  'Japanese',
  'Carolina, Puerto Rico',
  5),
 ('Jonathan Silverman', '08/04/1982', 'Pizza', 'Brooklyn, NY', 37),
 ('Guy Monahan', '07/21/1988', 'Dumplings', 'Jersey City, NJ', 31),
 ('Saad Raees', '05/13/1992', 'ice cream', 'Karachi,Pakistan', 10),
 ('Nick', '02/27/1988', 'Japanese', 'UK', 10),
 ('John', '02/27/1988', 'Pizza', 'UK', 10),
 ('Jennifer', '02/27/1988', 'Pizza', 'UK', 10),
 ('Sean Abu Wilson', '02/06/1985', 'Bread', 'Birmingham, AL', 9),
 ('Ivan Zakharchuk', '01/27/1988', 'Mediterranean', 'Kiev', 10),
 ('christa', '01/11/1996', 'buffalo wings', 'new york', 2),
 ('Dariga Kokenova', None, 'Italian', 'Kazakhstan', 14)]

# Which 3 students have lived in NYC the shortest amount of time?

In [13]:
cursor.execute("""SELECT * 
                FROM students 
                ORDER BY YEARS_IN_NYC
                LIMIT 3
                ;""").fetchall()

[('christa', '01/11/1996', 'buffalo wings', 'new york', 2),
 ('Rafael Ferreira', '10/13/1991', 'Linguine con Vongole', 'Denver, CO', 5),
 ('Anjanette Vanessa Jia Craynock Santiago Delgado Betancourt Rivera',
  '10/03/1987',
  'Japanese',
  'Carolina, Puerto Rico',
  5)]

# How many students are native New Yorkers?

In [14]:
cursor.execute("""SELECT COUNT()
                FROM students 
                WHERE BIRTHPLACE like '%NY%' 
                OR  
                BIRTHPLACE like '%new york%'
                ;""").fetchall()

[(2,)]

In [15]:
df

Unnamed: 0,NAME,BIRTHDAY,FAV_FOOD,BIRTHPLACE,YEARS_IN_NYC
0,Sean Abu Wilson,02/06/1985,Bread,"Birmingham, AL",9
1,christa,01/11/1996,buffalo wings,new york,2
2,Dariga Kokenova,,Italian,Kazakhstan,14
3,Rafael Ferreira,10/13/1991,Linguine con Vongole,"Denver, CO",5
4,Saad Raees,05/13/1992,ice cream,"Karachi,Pakistan",10
5,Ivan Zakharchuk,01/27/1988,Mediterranean,Kiev,10
6,Anjanette Vanessa Jia Craynock Santiago Delgad...,10/03/1987,Japanese,"Carolina, Puerto Rico",5
7,Guy Monahan,07/21/1988,Dumplings,"Jersey City, NJ",31
8,Jonathan Silverman,08/04/1982,Pizza,"Brooklyn, NY",37
9,Nick,02/27/1988,Japanese,UK,10


# Do any two students have the same favorite food?

In [16]:
cursor.execute("""SELECT FAV_FOOD , COUNT(*)
                FROM students 
                GROUP BY FAV_FOOD
                HAVING COUNT(*)>1
                ORDER BY COUNT(*) DESC 
                ;""").fetchone()

('Pizza', 3)

In [17]:
cursor.execute("""SELECT FAV_FOOD , COUNT(*)
                FROM students 
                GROUP BY FAV_FOOD
                ORDER BY COUNT(*) DESC 
            
                ;""").fetchall()

[('Pizza', 3),
 ('Japanese', 2),
 ('ice cream', 1),
 ('buffalo wings', 1),
 ('Mediterranean', 1),
 ('Linguine con Vongole', 1),
 ('Italian', 1),
 ('Dumplings', 1),
 ('Bread', 1)]

In [18]:
max_query = """SELECT FAV_FOOD, MAX(food_count)
    FROM (
            SELECT FAV_FOOD , COUNT(*) as food_count
            FROM students 
            GROUP BY FAV_FOOD
            ORDER BY COUNT(*) DESC 
        )
"""

In [19]:
cursor.execute(max_query)

<sqlite3.Cursor at 0x7f848fe4bce0>

In [20]:
cursor.fetchall()

[('Pizza', 3)]

In [21]:
above_avg = """
SELECT NAME 
FROM students
WHERE years_in_nyc > (SELECT avg(years_in_nyc) from students)
"""

In [22]:
cursor.execute(above_avg).fetchall()

[('Dariga Kokenova',), ('Guy Monahan',), ('Jonathan Silverman',)]

In [23]:
cursor.execute ('SELECT avg(years_in_nyc) from students').fetchall()

[(12.75,)]

In [24]:
def date_format(birthday):
    spltd_bday = re.split('[/ -]|[.]', birthday)
    
    if len(spltd_bday)<3:
        return None
    else:
        m,d,y = spltd_bday
        date_obj = date(int(y),int(m),int(d))
        
        return datetime.strftime(date_obj, "%m/%d/%Y")

In [25]:
date_format('01.27.1988')

'01/27/1988'