In [1]:
import requests # pull, extract data from APIs
import pandas as pd # Data manilulation, transformation
from sqlalchemy import create_engine # create connection to db

In [2]:
# Extract data from API source 
def extract()-> dict:
    """ This API extracts data from
    http://universities.hipolabs.com
    """
    API_URL = "http://universities.hipolabs.com/search?country=United+States"
    data = requests.get(API_URL).json()
    return data

In [3]:
# Transform data 
def transform(data:dict) -> pd.DataFrame:
    """ Transforms the dataset into desired structure and filters"""
    df = pd.DataFrame(data)
    print(f"Total Number of universities from API {len(data)}")
    df = df[df["name"].str.contains("California")]
    print(f"Number of universities in california {len(df)}")
    df['domains'] = [','.join(map(str, l)) for l in df['domains']]
    df['web_pages'] = [','.join(map(str, l)) for l in df['web_pages']]
    df = df.reset_index(drop=True)
    return df[["domains","country","web_pages","name"]]

In [4]:
# Load data
def load(df:pd.DataFrame)-> None:
    """ Loads data into a sqllite database"""
    disk_engine = create_engine('sqlite:///my_lite_store.db')
    df.to_sql('cal_uni', disk_engine, if_exists='replace')

In [5]:
data = extract()

In [33]:
data

[{'alpha_two_code': 'US',
  'name': 'Marywood University',
  'domains': ['marywood.edu'],
  'web_pages': ['http://www.marywood.edu'],
  'country': 'United States',
  'state-province': None},
 {'alpha_two_code': 'US',
  'name': 'Lindenwood University',
  'domains': ['lindenwood.edu'],
  'web_pages': ['http://www.lindenwood.edu/'],
  'country': 'United States',
  'state-province': None},
 {'alpha_two_code': 'US',
  'name': 'Sullivan University',
  'domains': ['sullivan.edu'],
  'web_pages': ['https://sullivan.edu/'],
  'country': 'United States',
  'state-province': None},
 {'alpha_two_code': 'US',
  'name': 'Florida State College at Jacksonville',
  'domains': ['fscj.edu'],
  'web_pages': ['https://www.fscj.edu/'],
  'country': 'United States',
  'state-province': None},
 {'alpha_two_code': 'US',
  'name': 'Xavier University',
  'domains': ['xavier.edu'],
  'web_pages': ['https://www.xavier.edu/'],
  'country': 'United States',
  'state-province': None},
 {'alpha_two_code': 'US',
  'nam

In [6]:
df = transform(data)

Total Number of universities from API 2308
Number of universities in california 43


In [7]:
df

Unnamed: 0,domains,country,web_pages,name
0,csuci.edu,United States,https://www.csuci.edu/,California State University Channel Islands
1,calbaptist.edu,United States,http://www.calbaptist.edu/,California Baptist University
2,calsouthern.edu,United States,http://www.calsouthern.edu/,California Southern University
3,cpp.edu,United States,http://www.cpp.edu/,"California Polytechnic State University, Pomona"
4,calcoast.edu,United States,http://www.calcoast.edu/,California Coast University
5,caltech.edu,United States,http://www.caltech.edu/,California Institute of Technology
6,callutheran.edu,United States,http://www.callutheran.edu/,California Lutheran University
7,csum.edu,United States,http://www.csum.edu/,California Maritime Academy
8,cnuas.edu,United States,http://www.cnuas.edu/,California National University
9,calpoly.edu,United States,http://www.calpoly.edu/,"California Polytechnic State University, San L..."


In [28]:
load(df)

In [8]:
# Transform data step by step
new_df = pd.DataFrame(data) # create dataframe

In [9]:
new_df

Unnamed: 0,alpha_two_code,web_pages,country,domains,name,state-province
0,US,[http://www.marywood.edu],United States,[marywood.edu],Marywood University,
1,US,[http://www.lindenwood.edu/],United States,[lindenwood.edu],Lindenwood University,
2,US,[https://sullivan.edu/],United States,[sullivan.edu],Sullivan University,
3,US,[https://www.fscj.edu/],United States,[fscj.edu],Florida State College at Jacksonville,
4,US,[https://www.xavier.edu/],United States,[xavier.edu],Xavier University,
...,...,...,...,...,...,...
2303,US,[https://www.uolivet.edu/],United States,[olivetcollege.edu],The University of Olivet,Michigan
2304,US,[https://westminsteru.edu/],United States,[westminsteru.edu],Westminster University,"Salt Lake City, Utah"
2305,US,[https://www.dom.edu/],United States,[dom.edu],Dominican University,Illinois
2306,US,[http://www.claremont.edu/],United States,[claremont.edu],Claremont Colleges,


In [10]:
# Filter university in California
new_df = new_df[new_df['name'].str.contains('California')]

In [11]:
new_df

Unnamed: 0,alpha_two_code,web_pages,country,domains,name,state-province
22,US,[https://www.csuci.edu/],United States,[csuci.edu],California State University Channel Islands,
60,US,[http://www.calbaptist.edu/],United States,[calbaptist.edu],California Baptist University,
72,US,[http://www.calsouthern.edu/],United States,[calsouthern.edu],California Southern University,
119,US,[http://www.cpp.edu/],United States,[cpp.edu],"California Polytechnic State University, Pomona",
222,US,[http://www.calcoast.edu/],United States,[calcoast.edu],California Coast University,
223,US,[http://www.caltech.edu/],United States,[caltech.edu],California Institute of Technology,
224,US,[http://www.callutheran.edu/],United States,[callutheran.edu],California Lutheran University,
225,US,[http://www.csum.edu/],United States,[csum.edu],California Maritime Academy,
226,US,[http://www.cnuas.edu/],United States,[cnuas.edu],California National University,
227,US,[http://www.calpoly.edu/],United States,[calpoly.edu],"California Polytechnic State University, San L...",


In [12]:
new_df['domains'] = [''.join(map(str, l)) for l in df['domains']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['domains'] = [''.join(map(str, l)) for l in df['domains']]


In [13]:
new_df

Unnamed: 0,alpha_two_code,web_pages,country,domains,name,state-province
22,US,[https://www.csuci.edu/],United States,csuci.edu,California State University Channel Islands,
60,US,[http://www.calbaptist.edu/],United States,calbaptist.edu,California Baptist University,
72,US,[http://www.calsouthern.edu/],United States,calsouthern.edu,California Southern University,
119,US,[http://www.cpp.edu/],United States,cpp.edu,"California Polytechnic State University, Pomona",
222,US,[http://www.calcoast.edu/],United States,calcoast.edu,California Coast University,
223,US,[http://www.caltech.edu/],United States,caltech.edu,California Institute of Technology,
224,US,[http://www.callutheran.edu/],United States,callutheran.edu,California Lutheran University,
225,US,[http://www.csum.edu/],United States,csum.edu,California Maritime Academy,
226,US,[http://www.cnuas.edu/],United States,cnuas.edu,California National University,
227,US,[http://www.calpoly.edu/],United States,calpoly.edu,"California Polytechnic State University, San L...",


In [14]:
new_df['web_pages'] = [''.join(map(str, l)) for l in df['web_pages']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['web_pages'] = [''.join(map(str, l)) for l in df['web_pages']]


In [15]:
new_df

Unnamed: 0,alpha_two_code,web_pages,country,domains,name,state-province
22,US,https://www.csuci.edu/,United States,csuci.edu,California State University Channel Islands,
60,US,http://www.calbaptist.edu/,United States,calbaptist.edu,California Baptist University,
72,US,http://www.calsouthern.edu/,United States,calsouthern.edu,California Southern University,
119,US,http://www.cpp.edu/,United States,cpp.edu,"California Polytechnic State University, Pomona",
222,US,http://www.calcoast.edu/,United States,calcoast.edu,California Coast University,
223,US,http://www.caltech.edu/,United States,caltech.edu,California Institute of Technology,
224,US,http://www.callutheran.edu/,United States,callutheran.edu,California Lutheran University,
225,US,http://www.csum.edu/,United States,csum.edu,California Maritime Academy,
226,US,http://www.cnuas.edu/,United States,cnuas.edu,California National University,
227,US,http://www.calpoly.edu/,United States,calpoly.edu,"California Polytechnic State University, San L...",


In [16]:
new_df = new_df.reset_index(drop=True)

In [17]:
new_df

Unnamed: 0,alpha_two_code,web_pages,country,domains,name,state-province
0,US,https://www.csuci.edu/,United States,csuci.edu,California State University Channel Islands,
1,US,http://www.calbaptist.edu/,United States,calbaptist.edu,California Baptist University,
2,US,http://www.calsouthern.edu/,United States,calsouthern.edu,California Southern University,
3,US,http://www.cpp.edu/,United States,cpp.edu,"California Polytechnic State University, Pomona",
4,US,http://www.calcoast.edu/,United States,calcoast.edu,California Coast University,
5,US,http://www.caltech.edu/,United States,caltech.edu,California Institute of Technology,
6,US,http://www.callutheran.edu/,United States,callutheran.edu,California Lutheran University,
7,US,http://www.csum.edu/,United States,csum.edu,California Maritime Academy,
8,US,http://www.cnuas.edu/,United States,cnuas.edu,California National University,
9,US,http://www.calpoly.edu/,United States,calpoly.edu,"California Polytechnic State University, San L...",
