# Introducción a SQL con Pandas

### Setup

Primero importamos la librería Pandas y SQLite

In [1]:
import pandas as pd
import sqlite3 as sql

Ahora leemos la información de graduados de universidades desde el github de fivethirtyeight de Nate Silver

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv")
df.head()

Unnamed: 0,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,Employed,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,1976,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,640,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,648,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,758,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,25694,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 21 columns):
Rank                    173 non-null int64
Major_code              173 non-null int64
Major                   173 non-null object
Total                   172 non-null float64
Men                     172 non-null float64
Women                   172 non-null float64
Major_category          173 non-null object
ShareWomen              172 non-null float64
Sample_size             173 non-null int64
Employed                173 non-null int64
Full_time               173 non-null int64
Part_time               173 non-null int64
Full_time_year_round    173 non-null int64
Unemployed              173 non-null int64
Unemployment_rate       173 non-null float64
Median                  173 non-null int64
P25th                   173 non-null int64
P75th                   173 non-null int64
College_jobs            173 non-null int64
Non_college_jobs        173 non-null int64
Low_wage_jobs          

Ahora crearemos nuestra base de datos a partir del DataFrame que acabamos de descargar, primero establecemos la conexión a nuestra base de datos

In [4]:
con = sql.connect("clase3.db")

y generamos un cursor que nos ayude a manipular esta base de datos con SQL

In [5]:
cur = con.cursor()

El siguiente paso es convertir nuestro DataFrame en una tabla de la base de datos

In [6]:
df.to_sql?

In [8]:
df.to_sql('recent_grads',con)

Ahora podemos ejecutar sentencias SQL de dos formas, directo de la base de datos con cur.execute

In [9]:
cur.execute('''
SELECT * FROM recent_grads;
''')

<sqlite3.Cursor at 0x1094d0a40>

In [10]:
cur.fetchall() # Nos muestra todos los resultados de nuestra Query

[(0,
  1,
  2419,
  'PETROLEUM ENGINEERING',
  2339.0,
  2057.0,
  282.0,
  'Engineering',
  0.120564344,
  36,
  1976,
  1849,
  270,
  1207,
  37,
  0.018380527,
  110000,
  95000,
  125000,
  1534,
  364,
  193),
 (1,
  2,
  2416,
  'MINING AND MINERAL ENGINEERING',
  756.0,
  679.0,
  77.0,
  'Engineering',
  0.10185185199999999,
  7,
  640,
  556,
  170,
  388,
  85,
  0.117241379,
  75000,
  55000,
  90000,
  350,
  257,
  50),
 (2,
  3,
  2415,
  'METALLURGICAL ENGINEERING',
  856.0,
  725.0,
  131.0,
  'Engineering',
  0.153037383,
  3,
  648,
  558,
  133,
  340,
  16,
  0.024096386,
  73000,
  50000,
  105000,
  456,
  176,
  0),
 (3,
  4,
  2417,
  'NAVAL ARCHITECTURE AND MARINE ENGINEERING',
  1258.0,
  1123.0,
  135.0,
  'Engineering',
  0.107313196,
  16,
  758,
  1069,
  150,
  692,
  40,
  0.050125313,
  70000,
  43000,
  80000,
  529,
  102,
  0),
 (4,
  5,
  2405,
  'CHEMICAL ENGINEERING',
  32260.0,
  21239.0,
  11021.0,
  'Engineering',
  0.341630502,
  289,
  25694

o bien utilizando read_sql_query que nos regresa un DataFrame. En este notebook utilizaremos el segundo porque nos arroja un output más legible

In [11]:
pd.read_sql_query("SELECT * FROM recent_grads", con)

Unnamed: 0,index,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972
5,5,6,2418,NUCLEAR ENGINEERING,2573.0,2200.0,373.0,Engineering,0.144967,17,...,264,1449,400,0.177226,65000,50000,102000,1142,657,244
6,6,7,6202,ACTUARIAL SCIENCE,3777.0,2110.0,1667.0,Business,0.441356,51,...,296,2482,308,0.095652,62000,53000,72000,1768,314,259
7,7,8,5001,ASTRONOMY AND ASTROPHYSICS,1792.0,832.0,960.0,Physical Sciences,0.535714,10,...,553,827,33,0.021167,62000,31500,109000,972,500,220
8,8,9,2414,MECHANICAL ENGINEERING,91227.0,80320.0,10907.0,Engineering,0.119559,1029,...,13101,54639,4650,0.057342,60000,48000,70000,52844,16384,3253
9,9,10,2408,ELECTRICAL ENGINEERING,81527.0,65511.0,16016.0,Engineering,0.196450,631,...,12695,41413,3895,0.059174,60000,45000,72000,45829,10874,3170


### Recent Grads

Primero podemos imprimir las primeras 5 observaciones de la base de datos (similar al método head)

In [12]:
pd.read_sql_query("SELECT * FROM recent_grads LIMIT 5;", con)

Unnamed: 0,index,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972


#### 1. Escribe una sentencia de SQL que regrese las primeras 10 filas

In [13]:
pd.read_sql_query("SELECT * FROM recent_grads LIMIT 10;", con)

Unnamed: 0,index,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972
5,5,6,2418,NUCLEAR ENGINEERING,2573.0,2200.0,373.0,Engineering,0.144967,17,...,264,1449,400,0.177226,65000,50000,102000,1142,657,244
6,6,7,6202,ACTUARIAL SCIENCE,3777.0,2110.0,1667.0,Business,0.441356,51,...,296,2482,308,0.095652,62000,53000,72000,1768,314,259
7,7,8,5001,ASTRONOMY AND ASTROPHYSICS,1792.0,832.0,960.0,Physical Sciences,0.535714,10,...,553,827,33,0.021167,62000,31500,109000,972,500,220
8,8,9,2414,MECHANICAL ENGINEERING,91227.0,80320.0,10907.0,Engineering,0.119559,1029,...,13101,54639,4650,0.057342,60000,48000,70000,52844,16384,3253
9,9,10,2408,ELECTRICAL ENGINEERING,81527.0,65511.0,16016.0,Engineering,0.19645,631,...,12695,41413,3895,0.059174,60000,45000,72000,45829,10874,3170


Ahora podemos ver que hay maneras adicionales para pedirle que nos regrese una columna en particular

In [14]:
pd.read_sql_query("SELECT Major FROM recent_grads LIMIT 5;", con)

Unnamed: 0,Major
0,PETROLEUM ENGINEERING
1,MINING AND MINERAL ENGINEERING
2,METALLURGICAL ENGINEERING
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING
4,CHEMICAL ENGINEERING


Del mismo modo podemos pedirle que nos regrese un par de columnas si separamos por comas

In [15]:
pd.read_sql_query("SELECT Major, Major_category FROM recent_grads LIMIT 5;",con)

Unnamed: 0,Major,Major_category
0,PETROLEUM ENGINEERING,Engineering
1,MINING AND MINERAL ENGINEERING,Engineering
2,METALLURGICAL ENGINEERING,Engineering
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering
4,CHEMICAL ENGINEERING,Engineering


Estas querys nos regresan toda la columna de Major y Major category, pero podemos pedirle que sólo nos regrese aquellos records donde la proporción de mujeres (Sharewomen) es preponderante (>=0.5)

In [16]:
pd.read_sql_query("""
SELECT Major FROM recent_grads
WHERE ShareWomen >= 0.5 LIMIT 5;
""", con)

Unnamed: 0,Major
0,ASTRONOMY AND ASTROPHYSICS
1,PUBLIC POLICY
2,NURSING
3,"NUCLEAR, INDUSTRIAL RADIOLOGY, AND BIOLOGICAL ..."
4,ACCOUNTING


#### 2. Selecciona con una Query que regrese las Majors donde las mujeres son minoría, sólo regresa las columnas de Major y ShareWomen (en ese orden) y limita las filas regresadas a 10

In [17]:
pd.read_sql_query("""
SELECT Major, ShareWomen FROM recent_grads
WHERE ShareWomen < 0.5
LIMIT 10;
""", con)

Unnamed: 0,Major,ShareWomen
0,PETROLEUM ENGINEERING,0.120564
1,MINING AND MINERAL ENGINEERING,0.101852
2,METALLURGICAL ENGINEERING,0.153037
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,0.107313
4,CHEMICAL ENGINEERING,0.341631
5,NUCLEAR ENGINEERING,0.144967
6,ACTUARIAL SCIENCE,0.441356
7,MECHANICAL ENGINEERING,0.119559
8,ELECTRICAL ENGINEERING,0.19645
9,COMPUTER ENGINEERING,0.199413


Podemos seleccionar sin ningún problema las filas de una base de datos filtrando por más de una condición de la siguiente forma

In [18]:
pd.read_sql_query("""
SELECT * FROM recent_grads
WHERE (Major_category = 'Engineering') AND (ShareWomen > 0.2);
""", con)

Unnamed: 0,index,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972
1,12,13,2404,BIOMEDICAL ENGINEERING,14955.0,8407.0,6548.0,Engineering,0.437847,79,...,2694,5986,1019,0.092084,60000,36000,70000,6439,2471,789
2,13,14,5008,MATERIALS SCIENCE,4279.0,2949.0,1330.0,Engineering,0.31082,22,...,878,1967,78,0.023043,60000,39000,65000,2626,391,81
3,15,16,2402,BIOLOGICAL ENGINEERING,8925.0,6062.0,2863.0,Engineering,0.320784,55,...,1983,3413,589,0.087143,57100,40000,76000,3603,1595,524
4,16,17,2412,INDUSTRIAL AND MANUFACTURING ENGINEERING,18968.0,12453.0,6515.0,Engineering,0.343473,183,...,2243,11326,699,0.042876,57000,37900,67000,8306,3235,640
5,17,18,2400,GENERAL ENGINEERING,61152.0,45683.0,15469.0,Engineering,0.25296,425,...,7199,33540,2859,0.059824,56000,36000,69000,26898,11734,3192
6,18,19,2403,ARCHITECTURAL ENGINEERING,2825.0,1835.0,990.0,Engineering,0.350442,26,...,343,1848,170,0.061931,54000,38000,65000,1665,649,137
7,22,23,2502,ELECTRICAL ENGINEERING TECHNOLOGY,11565.0,8181.0,3384.0,Engineering,0.292607,97,...,1873,5681,824,0.087557,52000,35000,60000,5126,2686,696
8,23,24,2413,MATERIALS ENGINEERING AND MATERIALS SCIENCE,2993.0,2020.0,973.0,Engineering,0.325092,22,...,1040,1151,70,0.027789,52000,35000,62000,1911,305,70
9,25,26,2406,CIVIL ENGINEERING,53153.0,41081.0,12072.0,Engineering,0.227118,565,...,10080,29196,3270,0.07061,50000,40000,60000,28526,9356,2899


#### 3. Escribe una query que regrese todas las majors donde la mayoría son mujeres y todas las majors donde la mediana del salario es mayor que 50000, incluye las siguientes columnas en los resultados: Major, Major_category, Median, ShareWomen

In [10]:
pd.read_sql_query("""
SELECT Major, Major_category, Median, ShareWomen FROM recent_grads
WHERE (ShareWomen>0.5) OR (Median>50000);
""", con)

Unnamed: 0,Major,Major_category,Median,ShareWomen
0,PETROLEUM ENGINEERING,Engineering,110000,0.120564
1,MINING AND MINERAL ENGINEERING,Engineering,75000,0.101852
2,METALLURGICAL ENGINEERING,Engineering,73000,0.153037
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,70000,0.107313
4,CHEMICAL ENGINEERING,Engineering,65000,0.341631
5,NUCLEAR ENGINEERING,Engineering,65000,0.144967
6,ACTUARIAL SCIENCE,Business,62000,0.441356
7,ASTRONOMY AND ASTROPHYSICS,Physical Sciences,62000,0.535714
8,MECHANICAL ENGINEERING,Engineering,60000,0.119559
9,ELECTRICAL ENGINEERING,Engineering,60000,0.196450


Así como utilizamos el operador AND en el ejercicio pasado podemos sin problemas utilizar el operador OR

#### 4. Escribe una query que regrese las primeras 20 majors que 1) tengan una mediana salarial mayor o igual a 10,000 ó 2) tengan menos o igual a 1,000 personas desempleadas (unemployed). Incluye las siguientes columnas en los resultados: Major, Median, Unemployed

In [20]:
pd.read_sql_query("""
SELECT Major, Median, Unemployed FROM recent_grads
WHERE (Median>=10000) OR (Unemployed<=1000)
LIMIT 20;
""", con)

Unnamed: 0,Major,Median,Unemployed
0,PETROLEUM ENGINEERING,110000,37
1,MINING AND MINERAL ENGINEERING,75000,85
2,METALLURGICAL ENGINEERING,73000,16
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,70000,40
4,CHEMICAL ENGINEERING,65000,1672
5,NUCLEAR ENGINEERING,65000,400
6,ACTUARIAL SCIENCE,62000,308
7,ASTRONOMY AND ASTROPHYSICS,62000,33
8,MECHANICAL ENGINEERING,60000,4650
9,ELECTRICAL ENGINEERING,60000,3895


Para escribir lógica más compleja debemos agrupar nuestras condiciones en paréntesis, así podemos correr lo siguiente

In [21]:
pd.read_sql_query("""
SELECT Major, Major_category, ShareWomen, Unemployment_rate
FROM recent_grads
WHERE (Major_category = 'Engineering') AND (ShareWomen > 0.5 OR Unemployment_rate < 0.051);
""", con)

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,PETROLEUM ENGINEERING,Engineering,0.120564,0.018381
1,METALLURGICAL ENGINEERING,Engineering,0.153037,0.024096
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.107313,0.050125
3,MATERIALS SCIENCE,Engineering,0.31082,0.023043
4,ENGINEERING MECHANICS PHYSICS AND SCIENCE,Engineering,0.183985,0.006334
5,INDUSTRIAL AND MANUFACTURING ENGINEERING,Engineering,0.343473,0.042876
6,MATERIALS ENGINEERING AND MATERIALS SCIENCE,Engineering,0.325092,0.027789
7,INDUSTRIAL PRODUCTION TECHNOLOGIES,Engineering,0.24919,0.028308
8,ENGINEERING AND INDUSTRIAL MANAGEMENT,Engineering,0.174123,0.033652


#### 5. Escribe una query que regrese todos los Engineering majors que: tienen más de 20% de mujeres ó tienen una tasa de desempleo menor al 5.1%. Regresa en los resultados las columnas: Major, Major_category, ShareWomen, Unemployment_rate

In [22]:
pd.read_sql_query("""
SELECT Major, Major_category, ShareWomen, Unemployment_rate FROM recent_grads
WHERE (Major_category='Engineering') AND (ShareWomen>0.2 OR Unemployment_rate < 0.051);
""", con)

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,PETROLEUM ENGINEERING,Engineering,0.120564,0.018381
1,METALLURGICAL ENGINEERING,Engineering,0.153037,0.024096
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.107313,0.050125
3,CHEMICAL ENGINEERING,Engineering,0.341631,0.061098
4,BIOMEDICAL ENGINEERING,Engineering,0.437847,0.092084
5,MATERIALS SCIENCE,Engineering,0.31082,0.023043
6,ENGINEERING MECHANICS PHYSICS AND SCIENCE,Engineering,0.183985,0.006334
7,BIOLOGICAL ENGINEERING,Engineering,0.320784,0.087143
8,INDUSTRIAL AND MANUFACTURING ENGINEERING,Engineering,0.343473,0.042876
9,GENERAL ENGINEERING,Engineering,0.25296,0.059824


Podemos querer además de obtener los resultados ordenarlos de tal manera que nos parezcan más legibles, esto se consigue aplicando la instrucción ORDER BY

In [23]:
pd.read_sql_query("""
SELECT Major, Major_category, ShareWomen, Unemployment_rate FROM recent_grads
WHERE (Major_category='Engineering') AND (ShareWomen>0.2 OR Unemployment_rate < 0.051)
ORDER BY Unemployment_rate;
""", con)

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,ENGINEERING MECHANICS PHYSICS AND SCIENCE,Engineering,0.183985,0.006334
1,PETROLEUM ENGINEERING,Engineering,0.120564,0.018381
2,MATERIALS SCIENCE,Engineering,0.31082,0.023043
3,METALLURGICAL ENGINEERING,Engineering,0.153037,0.024096
4,MATERIALS ENGINEERING AND MATERIALS SCIENCE,Engineering,0.325092,0.027789
5,INDUSTRIAL PRODUCTION TECHNOLOGIES,Engineering,0.24919,0.028308
6,ENGINEERING AND INDUSTRIAL MANAGEMENT,Engineering,0.174123,0.033652
7,INDUSTRIAL AND MANUFACTURING ENGINEERING,Engineering,0.343473,0.042876
8,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.107313,0.050125
9,MISCELLANEOUS ENGINEERING TECHNOLOGIES,Engineering,0.200023,0.052539


Si quisieramos ordenar descendentemente, entonces escribiríamos 'DESC'

In [25]:
pd.read_sql_query("""
SELECT Major, Major_category, ShareWomen, Unemployment_rate FROM recent_grads
WHERE (Major_category='Engineering') AND (ShareWomen>0.2 OR Unemployment_rate < 0.051)
ORDER BY Unemployment_rate DESC;
""", con)

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,ARCHITECTURE,Engineering,0.451465,0.113332
1,ENVIRONMENTAL ENGINEERING,Engineering,0.342229,0.093589
2,BIOMEDICAL ENGINEERING,Engineering,0.437847,0.092084
3,ELECTRICAL ENGINEERING TECHNOLOGY,Engineering,0.292607,0.087557
4,BIOLOGICAL ENGINEERING,Engineering,0.320784,0.087143
5,GEOLOGICAL AND GEOPHYSICAL ENGINEERING,Engineering,0.322222,0.075038
6,CIVIL ENGINEERING,Engineering,0.227118,0.07061
7,ARCHITECTURAL ENGINEERING,Engineering,0.350442,0.061931
8,CHEMICAL ENGINEERING,Engineering,0.341631,0.061098
9,GENERAL ENGINEERING,Engineering,0.25296,0.059824


#### 6. Escribe una query que regrese todas las majors donde la proporción de mujeres es mayor a 30% y la tasa de desempleo es menor a 10%, incluye la siguientes columnas en tus resultados Major, ShareWomen, Unemployment_rate y finalmente orden tus resultados en orden descendente respecto a ShareWomen

In [32]:
pd.read_sql_query("""
SELECT Major, ShareWomen, Unemployment_rate FROM recent_grads
WHERE ShareWomen>0.3 AND Unemployment_rate<0.1
ORDER BY ShareWomen DESC;
""", con)

Unnamed: 0,Major,ShareWomen,Unemployment_rate
0,EARLY CHILDHOOD EDUCATION,0.968954,0.040105
1,COMMUNICATION DISORDERS SCIENCES AND SERVICES,0.967998,0.047584
2,MEDICAL ASSISTING SERVICES,0.927807,0.042507
3,ELEMENTARY EDUCATION,0.923745,0.046586
4,FAMILY AND CONSUMER SCIENCES,0.910933,0.067128
5,SPECIAL NEEDS EDUCATION,0.906677,0.041508
6,HUMAN SERVICES AND COMMUNITY ORGANIZATION,0.905590,0.037819
7,SOCIAL WORK,0.904075,0.068828
8,NURSING,0.896019,0.044863
9,MISCELLANEOUS HEALTH MEDICAL PROFESSIONS,0.881294,0.081411


#### 7. Escribe una query que regrese todas las majors que sean 'Engineering' o 'Physical Sciences' en orden ascendente de tasas de desempleo (Unemployment_rate). Regresa en tus resultados las columnas Major_category, Major y Unemployment Rate

In [33]:
pd.read_sql_query("""
SELECT Major_category, Major, Unemployment_rate FROM recent_grads
WHERE Major_category='Engineering' OR Major_category="Physical Sciences"
ORDER BY Unemployment_rate ASC;
""", con)

Unnamed: 0,Major_category,Major,Unemployment_rate
0,Engineering,ENGINEERING MECHANICS PHYSICS AND SCIENCE,0.006334
1,Engineering,PETROLEUM ENGINEERING,0.018381
2,Physical Sciences,ASTRONOMY AND ASTROPHYSICS,0.021167
3,Physical Sciences,ATMOSPHERIC SCIENCES AND METEOROLOGY,0.022229
4,Engineering,MATERIALS SCIENCE,0.023043
5,Engineering,METALLURGICAL ENGINEERING,0.024096
6,Physical Sciences,GEOSCIENCES,0.024374
7,Engineering,MATERIALS ENGINEERING AND MATERIALS SCIENCE,0.027789
8,Engineering,INDUSTRIAL PRODUCTION TECHNOLOGIES,0.028308
9,Engineering,ENGINEERING AND INDUSTRIAL MANAGEMENT,0.033652


#### Estadística descriptiva

Hasta ahora hemos regresado, filtrado y ordenado sólo los datos de la tabla, pero para resolver preguntas más interesantes tenemos que poder realizar operaciones con estos datos

Por ejemplo, queremos saber cuantas Majors tienen más de 25% de mujeres

Si escribimos

In [34]:
pd.read_sql_query("""
SELECT Major FROM recent_grads
WHERE ShareWomen > 0.25;
""", con)

Unnamed: 0,Major
0,CHEMICAL ENGINEERING
1,ACTUARIAL SCIENCE
2,ASTRONOMY AND ASTROPHYSICS
3,BIOMEDICAL ENGINEERING
4,MATERIALS SCIENCE
5,BIOLOGICAL ENGINEERING
6,INDUSTRIAL AND MANUFACTURING ENGINEERING
7,GENERAL ENGINEERING
8,ARCHITECTURAL ENGINEERING
9,ELECTRICAL ENGINEERING TECHNOLOGY


sin embargo lo más efectivo para saber cuántas majors son en total es utilizar la función COUNT()

In [35]:
pd.read_sql_query("""
SELECT COUNT(Major) FROM recent_grads
WHERE ShareWomen > 0.25;
""", con)

Unnamed: 0,COUNT(Major)
0,146


#### 8. Cuenta cuántas majors existen donde la proporción de mujeres sea menor al 30%

In [36]:
pd.read_sql_query("""
SELECT COUNT(Major) FROM recent_grads
WHERE ShareWomen < 0.3;
""", con)

Unnamed: 0,COUNT(Major)
0,34


Otras funciones útiles son MIN() y MAX(), podemos escribir

In [11]:
pd.read_sql_query("""
SELECT Major, MIN(ShareWomen) 
FROM recent_grads;
""", con)

Unnamed: 0,Major,MIN(ShareWomen)
0,MILITARY TECHNOLOGIES,0.0


Es interesante que haya una Major que no tiene mujeres en nuestro dataset

#### 9. Escribe una query que regrese la major de Engineering que tenga el menor salario mediano. Regresa sólo las columnas Major, Major_category y MIN(Median) en tus resultados

In [38]:
pd.read_sql_query("""
SELECT Major, Major_category, MIN(Median)
FROM recent_grads
WHERE Major_category = "Engineering";
""", con)

Unnamed: 0,Major,Major_category,MIN(Median)
0,ARCHITECTURE,Engineering,40000


Otras funciones útiles son SUM() y AVG()

#### 10. Escribe una query que regrese la suma de la columna 'Total' que contiene el total de estudiantes y una segunda query que nos regrese el promedio de la columna 'Total'

In [39]:
pd.read_sql_query("""
SELECT SUM(Total)
FROM recent_grads;
""", con)

Unnamed: 0,SUM(Total)
0,6771654.0


In [40]:
pd.read_sql_query("""
SELECT AVG(Total)
FROM recent_grads;
""", con)

Unnamed: 0,AVG(Total)
0,39370.081395


En vez de escribir una query individual para cada pregunta podemos escribir todas juntas

In [41]:
pd.read_sql_query("""
SELECT MIN(Median), MAX(Median), SUM(Total)
FROM recent_grads;
""", con)

Unnamed: 0,MIN(Median),MAX(Median),SUM(Total)
0,22000,110000,6771654.0


#### 11. Escribe una query que calcule el promedio de la columna Total, el mínimo de la columna Women y el máximo de la columna Women

In [12]:
pd.read_sql_query("""
SELECT AVG(Total), MIN(Women), MAX(Women)
FROM recent_grads;
""", con)

Unnamed: 0,AVG(Total),MIN(Women),MAX(Women)
0,39370.081395,0.0,307087.0


GROUP BY nos sirve para agrupar estadísticas por categoría

In [26]:
pd.read_sql_query("""
SELECT Major_category, SUM(Employed) 
FROM recent_grads 
GROUP BY Major_category;
""", con)

Unnamed: 0,Major_category,SUM(Employed)
0,Agriculture & Natural Resources,66943
1,Arts,288114
2,Biology & Life Science,302797
3,Business,1088742
4,Communications & Journalism,330660
5,Computers & Mathematics,237894
6,Education,479839
7,Engineering,420372
8,Health,372147
9,Humanities & Liberal Arts,544118


#### 12. Utiliza select para seleccionar las columnas Major_category y AVG(Share_Women) utiliza GROUP BY para agrupar por Major_category

In [44]:
pd.read_sql_query("""
SELECT Major_category, AVG(ShareWomen) FROM recent_grads
GROUP BY Major_category;
""", con)

Unnamed: 0,Major_category,AVG(ShareWomen)
0,Agriculture & Natural Resources,0.405267
1,Arts,0.603658
2,Biology & Life Science,0.587193
3,Business,0.483198
4,Communications & Journalism,0.658384
5,Computers & Mathematics,0.311772
6,Education,0.748507
7,Engineering,0.238889
8,Health,0.795152
9,Humanities & Liberal Arts,0.63179


Podemos renombrar las columnas calculadas con el operador AS de la siguiente forma

In [45]:
pd.read_sql_query("""
SELECT COUNT(*) AS num_students FROM recent_grads;
""", con)

Unnamed: 0,num_students
0,173


inclusive SQLite nos permite no escribir el operados AS si utilizamos un string

In [27]:
pd.read_sql_query("""
SELECT COUNT(*) "num students" FROM recent_grads;
""", con)

Unnamed: 0,num students
0,173


#### 13. Escribe una query que regrese el número de filas como "Número de estudiantes" y el máximo valor de Unemployment_rate como "Máxima tasa de desempleo"

In [48]:
pd.read_sql_query("""
SELECT COUNT(*) "Número de estudiantes", MAX(Unemployment_rate) "Máxima tasa de desempleo" FROM recent_grads;
""", con)

Unnamed: 0,Número de estudiantes,Máxima tasa de desempleo
0,173,0.177226


Si queremos obtener los valores únicos de una columna entonces podemos usar DISTINCT

In [50]:
pd.read_sql_query("""
SELECT DISTINCT Major_category FROM recent_grads;
""", con)

Unnamed: 0,Major_category
0,Engineering
1,Business
2,Physical Sciences
3,Law & Public Policy
4,Computers & Mathematics
5,Agriculture & Natural Resources
6,Industrial Arts & Consumer Services
7,Arts
8,Health
9,Social Science


si pasamos como argumento más de una columna, entonces nos buscará las combinaciones únicas. Ejemplo

In [52]:
pd.read_sql_query("""
SELECT DISTINCT Major, Major_category FROM recent_grads
limit 5;
""", con)

Unnamed: 0,Major,Major_category
0,PETROLEUM ENGINEERING,Engineering
1,MINING AND MINERAL ENGINEERING,Engineering
2,METALLURGICAL ENGINEERING,Engineering
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering
4,CHEMICAL ENGINEERING,Engineering


incluso podemos contar el número de valores únicos de una columna

In [53]:
pd.read_sql_query("""
SELECT COUNT(DISTINCT(Major_category)) unique_major_categories FROM recent_grads;
""", con)

Unnamed: 0,unique_major_categories
0,16


#### 14. Escribe una query que regrese el número total de valores únicos de las columnas 'Major', 'Major_category' y 'Major_code'. Coloca un alias por medio de un string a cada categoría

In [54]:
pd.read_sql_query("""
SELECT COUNT(DISTINCT(Major)) "unique_majors",
COUNT(DISTINCT(Major_category)) "unique_major_categories",
COUNT(DISTINCT(Major_code)) "unique_major_codes"
FROM recent_grads;
""", con)

Unnamed: 0,unique_majors,unique_major_categories,unique_major_codes
0,173,16,173


Otra pregunta interesante que ahora podemos contestar es ¿cuál de las majors tiene el mayor spread entre los salarios de los percentiles 75 y 25?

In [57]:
pd.read_sql_query("""
SELECT P75th - P25th quartile_spread, Major FROM recent_grads
LIMIT 10;
""", con)

Unnamed: 0,quartile_spread,Major
0,30000,PETROLEUM ENGINEERING
1,35000,MINING AND MINERAL ENGINEERING
2,55000,METALLURGICAL ENGINEERING
3,37000,NAVAL ARCHITECTURE AND MARINE ENGINEERING
4,25000,CHEMICAL ENGINEERING
5,52000,NUCLEAR ENGINEERING
6,19000,ACTUARIAL SCIENCE
7,77500,ASTRONOMY AND ASTROPHYSICS
8,22000,MECHANICAL ENGINEERING
9,27000,ELECTRICAL ENGINEERING


#### 15. Escribe una query que muestre la diferencia entre los percentiles 25 y 75 de salarios para todas las majors. Regresa en tu query primero la columna Major y Major_category utilizando el nombre de la columna por default. Asimismo, calcula la diferencia entre los percentiles 25 y 75 utilizando el alias quartile_spread. Finalmente, ordena los resultados de menor a mayor y regresa solo los primeros 20 resultados

In [58]:
pd.read_sql_query("""
SELECT Major, Major_category, P75th - P25th "quartile_spread" FROM recent_grads
ORDER BY quartile_spread ASC
LIMIT 20;
""", con)

Unnamed: 0,Major,Major_category,quartile_spread
0,MILITARY TECHNOLOGIES,Industrial Arts & Consumer Services,0
1,SCHOOL STUDENT COUNSELING,Education,2000
2,LIBRARY SCIENCE,Education,2000
3,COURT REPORTING,Law & Public Policy,4000
4,PHARMACOLOGY,Biology & Life Science,5000
5,EDUCATIONAL ADMINISTRATION AND SUPERVISION,Education,6000
6,COUNSELING PSYCHOLOGY,Psychology & Social Work,6800
7,SPECIAL NEEDS EDUCATION,Education,10000
8,MATHEMATICS TEACHER EDUCATION,Education,10000
9,SOCIAL WORK,Psychology & Social Work,10000


Algunas veces queremos seleccionar un conjunto de filas después de escribir un GROUP BY. Para estos casos podemos utilizar HAVING

In [59]:
pd.read_sql_query("""
SELECT Major_category, AVG(Employed) / AVG(Total) AS share_employed 
FROM recent_grads 
GROUP BY Major_category 
HAVING share_employed > .8;
""", con)

Unnamed: 0,Major_category,share_employed
0,Arts,0.806748
1,Business,0.835966
2,Communications & Journalism,0.842229
3,Education,0.85819
4,Health,0.803374
5,Industrial Arts & Consumer Services,0.82267
6,Law & Public Policy,0.808399


Fijense que SQL me permite utilizar los nombres personalizados en partes subsecuentes de mi Query incluyendo HAVING y WHERE

#### 16. Encuentra todas las categorías de major donde los graduados con salarios bajos son mayores al 10%. Utiliza SELECT para obtener Major_category, AVG(Low_wage_jobs)/AVG(Total) as share_low_wage, utiliza GROUP BY para agrupar por Major_category y utiliza HAVING para restringir la selección a las filas donde share_low_wage es mayor a 10%

In [60]:
pd.read_sql_query("""
SELECT Major_category, AVG(Low_wage_jobs)/AVG(Total) as "share_low_wage" FROM recent_grads
GROUP BY Major_category
HAVING share_low_wage>0.1;
""", con)

Unnamed: 0,Major_category,share_low_wage
0,Arts,0.168331
1,Communications & Journalism,0.126324
2,Humanities & Liberal Arts,0.132087
3,Industrial Arts & Consumer Services,0.115713
4,Law & Public Policy,0.115685
5,Psychology & Social Work,0.116934
6,Social Science,0.102233


Podemos utilizar la función ROUND para redondear valores. Ejemplo

In [62]:
pd.read_sql_query("""
SELECT ROUND(ShareWomen, 4), Major_category FROM recent_grads
LIMIT 10;
""", con)

Unnamed: 0,"ROUND(ShareWomen, 4)",Major_category
0,0.1206,Engineering
1,0.1019,Engineering
2,0.153,Engineering
3,0.1073,Engineering
4,0.3416,Engineering
5,0.145,Engineering
6,0.4414,Business
7,0.5357,Physical Sciences
8,0.1196,Engineering
9,0.1965,Engineering


#### 17. Utiliza select para seleccionar Major_category y calcula AVG(College_jobs)/AVG(Total) utilizando el alias de "share_degree_jobs" además utiliza la función ROUND para redondear a 3 decimales. Asimismo, agrupa por Major_category utilizando GROUP BY y selecciona sólo las filas donde "share_degree_jobs" sea menor al 30%

In [63]:
pd.read_sql_query("""
SELECT Major_category, ROUND(AVG(College_jobs)/AVG(Total), 3) as "share_degree_jobs" FROM recent_grads
GROUP BY Major_category
HAVING share_degree_jobs < 0.3
""", con)

Unnamed: 0,Major_category,share_degree_jobs
0,Agriculture & Natural Resources,0.236
1,Arts,0.265
2,Business,0.114
3,Communications & Journalism,0.22
4,Humanities & Liberal Arts,0.27
5,Industrial Arts & Consumer Services,0.249
6,Law & Public Policy,0.163
7,Social Science,0.215


Podemos conocer algunos de los metadatos de nuestra tabla si corremos el siguiente comando

In [64]:
pd.read_sql_query("""
PRAGMA TABLE_INFO(recent_grads);
""", con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,Rank,INTEGER,0,,0
2,2,Major_code,INTEGER,0,,0
3,3,Major,TEXT,0,,0
4,4,Total,REAL,0,,0
5,5,Men,REAL,0,,0
6,6,Women,REAL,0,,0
7,7,Major_category,TEXT,0,,0
8,8,ShareWomen,REAL,0,,0
9,9,Sample_size,INTEGER,0,,0


Veamos que si nosotros calculamos un ratio entre dos datos del tipo integer, entonces SQLite va a redondear hacia abajo para quedarnos con la parte entera

In [73]:
pd.read_sql_query("""
SELECT Full_time, Employed, Full_time/Employed FROM recent_grads
LIMIT 5;
""", con)

Unnamed: 0,Full_time,Employed,Full_time/Employed
0,1849,1976,0
1,556,640,0
2,558,648,0
3,1069,758,1
4,23170,25694,0


Para obtener el resultado correcto tenemos que utilizar CAST() para cambiar el tipo de dato en nuestra operación

In [77]:
pd.read_sql_query("""
SELECT Full_time, Employed, CAST(Full_time AS Float)/CAST(Employed AS Float) "Share_full_time" FROM recent_grads
LIMIT 5;
""", con)

Unnamed: 0,Full_time,Employed,Share_full_time
0,1849,1976,0.935729
1,556,640,0.86875
2,558,648,0.861111
3,1069,758,1.41029
4,23170,25694,0.901767


#### 18. Escribe una query que seleccione las columnas Major_category y que calcule la proporción de empleados de tiempo completo como "Share_full_time", que agrupe por Major_category y que ordene los resultados por "Share_full_time" en forma ascendente

In [79]:
pd.read_sql_query("""
SELECT Major_category, CAST(Full_time AS Float)/CAST(Employed AS Float) "Share_full_time" FROM recent_grads
GROUP BY Major_category
ORDER BY Share_full_time;
""", con)

Unnamed: 0,Major_category,Share_full_time
0,Health,0.671135
1,Humanities & Liberal Arts,0.672358
2,Arts,0.695341
3,Physical Sciences,0.774993
4,Education,0.799191
5,Communications & Journalism,0.800577
6,Social Science,0.801916
7,Biology & Life Science,0.80572
8,Computers & Mathematics,0.810677
9,Interdisciplinary,0.817839


Ahora, intentemos resolver la pregunta: ¿Cuáles valores están por encima del promedio de la columna ShareWomen?

Si escribimos

In [28]:
pd.read_sql_query("""
SELECT * FROM recent_grads
WHERE ShareWomen > AVG(ShareWomen);
""", con)

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



DatabaseError: Execution failed on sql '
SELECT * FROM recent_grads
WHERE ShareWomen > AVG(ShareWomen);
': misuse of aggregate function AVG()

obtenemos un error

La manera de poder escribir este tipo de expresiones es con **subquerys** que siempre debemos de escribir rodeadas de paréntesis (subquery)

Podríamos obtener el resultado deseado escribiendo

In [81]:
pd.read_sql_query("""
SELECT * FROM recent_grads
WHERE ShareWomen > (SELECT AVG(ShareWomen) FROM recent_grads);
""", con)

Unnamed: 0,index,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,7,8,5001,ASTRONOMY AND ASTROPHYSICS,1792.0,832.0,960.0,Physical Sciences,0.535714,10,...,553,827,33,0.021167,62000,31500,109000,972,500,220
1,29,30,5402,PUBLIC POLICY,5978.0,2639.0,3339.0,Law & Public Policy,0.558548,55,...,1306,2776,670,0.128426,50000,35000,70000,1550,1871,340
2,34,35,6107,NURSING,209394.0,21773.0,187621.0,Health,0.896019,2554,...,40818,122817,8497,0.044863,48000,39000,58000,151643,26146,6193
3,39,40,5102,"NUCLEAR, INDUSTRIAL RADIOLOGY, AND BIOLOGICAL ...",2116.0,528.0,1588.0,Physical Sciences,0.750473,31,...,579,1115,137,0.071540,46000,38000,53000,162,1475,124
4,40,41,6201,ACCOUNTING,198633.0,94519.0,104114.0,Business,0.524153,2042,...,27693,123169,12411,0.069749,45000,34000,56000,11417,39323,10886
5,44,45,6105,MEDICAL TECHNOLOGIES TECHNICIANS,15914.0,3916.0,11998.0,Health,0.753927,190,...,2665,9005,505,0.036983,45000,36000,50000,5546,7176,1002
6,46,47,3702,STATISTICS AND DECISION SCIENCE,6251.0,2960.0,3291.0,Computers & Mathematics,0.526476,37,...,1840,2151,401,0.086274,45000,26700,60000,2298,1200,343
7,48,49,3607,PHARMACOLOGY,1762.0,515.0,1247.0,Biology & Life Science,0.707719,3,...,532,565,107,0.085532,45000,40000,45000,603,478,93
8,49,50,5006,OCEANOGRAPHY,2418.0,752.0,1666.0,Physical Sciences,0.688999,36,...,379,1595,99,0.056995,44700,23000,50000,459,996,186
9,51,52,6104,MEDICAL ASSISTING SERVICES,11123.0,803.0,10320.0,Health,0.927807,67,...,4107,4290,407,0.042507,42000,30000,65000,2091,6948,1270


#### 19. Escribe una query que regrese las Majors que tienen una Unemployment_rate por debajo del promedio. Tu selección debe contener sólo las columnas Major y Unemployment_rate y tus resultados deben estar ordenados en orden ascendente por Unemployment_rate

In [82]:
pd.read_sql_query("""
SELECT Major, Unemployment_rate FROM recent_grads
WHERE Unemployment_rate < (SELECT AVG(Unemployment_rate) FROM recent_grads)
ORDER BY Unemployment_rate ASC;
""", con)

Unnamed: 0,Major,Unemployment_rate
0,MATHEMATICS AND COMPUTER SCIENCE,0.000000
1,MILITARY TECHNOLOGIES,0.000000
2,BOTANY,0.000000
3,SOIL SCIENCE,0.000000
4,EDUCATIONAL ADMINISTRATION AND SUPERVISION,0.000000
5,ENGINEERING MECHANICS PHYSICS AND SCIENCE,0.006334
6,COURT REPORTING,0.011690
7,MATHEMATICS TEACHER EDUCATION,0.016203
8,PETROLEUM ENGINEERING,0.018381
9,GENERAL AGRICULTURE,0.019642


El operador IN en SQL sirve para utilizar un WHERE sobre un conjunto de categorías. Por ejemplo, que la categoría pudiera ser 'Business' o 'Engineering'

In [83]:
pd.read_sql_query("""
SELECT Major, Major_category FROM recent_grads
WHERE Major_category IN ('Business', 'Engineering')
LIMIT 7;
""", con)

Unnamed: 0,Major,Major_category
0,PETROLEUM ENGINEERING,Engineering
1,MINING AND MINERAL ENGINEERING,Engineering
2,METALLURGICAL ENGINEERING,Engineering
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering
4,CHEMICAL ENGINEERING,Engineering
5,NUCLEAR ENGINEERING,Engineering
6,ACTUARIAL SCIENCE,Business


Asimismo, podemos seleccionar aquellas categorías de Majors con más egresados

In [84]:
pd.read_sql_query("""
SELECT Major_category FROM recent_grads
GROUP BY Major_category
ORDER BY SUM(Total) DESC
LIMIT 5;
""", con)

Unnamed: 0,Major_category
0,Business
1,Humanities & Liberal Arts
2,Education
3,Engineering
4,Social Science


Uniendo nuestras dos querys anteriores podremos seleccionar aquellas Major_categories que se encuentren dentro de esta lista

In [86]:
pd.read_sql_query("""
SELECT Major, Major_category FROM recent_grads
WHERE Major_category IN 
(SELECT Major_category FROM recent_grads
GROUP BY Major_category
ORDER BY SUM(Total)
DESC LIMIT 5)
""", con)

Unnamed: 0,Major,Major_category
0,PETROLEUM ENGINEERING,Engineering
1,MINING AND MINERAL ENGINEERING,Engineering
2,METALLURGICAL ENGINEERING,Engineering
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering
4,CHEMICAL ENGINEERING,Engineering
5,NUCLEAR ENGINEERING,Engineering
6,ACTUARIAL SCIENCE,Business
7,MECHANICAL ENGINEERING,Engineering
8,ELECTRICAL ENGINEERING,Engineering
9,COMPUTER ENGINEERING,Engineering


Calculemos ahora una query que nos regrese la muestra promedio (avg_ratio) de todas las majors

In [87]:
pd.read_sql_query("""
SELECT AVG(CAST(Sample_size as float)/CAST(Total as Float)) avg_ratio FROM recent_grads
""", con)

Unnamed: 0,avg_ratio
0,0.009091


#### 20. Escribe una query que seleccione las columnas Major, Major_category y la columna calculada ratio, que filtre las filas donde el ratio sea mayor que el avg_ratio. Tip: utiliza la subquery de arriba

In [88]:
pd.read_sql_query("""
SELECT Major, Major_category, cast(Sample_size as float)/cast(Total as float) ratio FROM recent_grads
WHERE ratio>(select AVG(cast(Sample_size as float)/cast(Total as float)) avg_ratio from recent_grads)""", con)

Unnamed: 0,Major,Major_category,ratio
0,PETROLEUM ENGINEERING,Engineering,0.015391
1,MINING AND MINERAL ENGINEERING,Engineering,0.009259
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.012719
3,ACTUARIAL SCIENCE,Business,0.013503
4,MECHANICAL ENGINEERING,Engineering,0.011280
5,COMPUTER ENGINEERING,Engineering,0.009605
6,AEROSPACE ENGINEERING,Engineering,0.009762
7,INDUSTRIAL AND MANUFACTURING ENGINEERING,Engineering,0.009648
8,ARCHITECTURAL ENGINEERING,Engineering,0.009204
9,COURT REPORTING,Law & Public Policy,0.012195


In [89]:
con.close()