## Loading External Data into Pandas

In [2]:
import numpy as np
import pandas as pd
# `plt` is the conventional alias for the pyplot interface; the bare
# `matplotlib` package does not provide plot()/subplots()/show().
import matplotlib.pyplot as plt

from pandas import DataFrame, Series

In [3]:
# Load a comma-separated file into a DataFrame and display it.
df = pd.read_csv(
    'data_sources/ex1.csv',
    sep=',',
)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Column labels to apply to the file; the column labelled 'Info' is then
# used as the row index instead of being kept as a data column.
names = [
    'data_a',
    'data_b',
    'data_c',
    'data_d',
    'Info',
]
df = pd.read_csv(
    'data_sources/ex2.csv',
    sep=',',
    names=names,
    index_col='Info',
)
df

Unnamed: 0_level_0,data_a,data_b,data_c,data_d
Info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [7]:
# Drop specific lines of the file (0-based line numbers 0, 2 and 3)
# before the remaining lines are parsed.
rows_to_skip = [0, 2, 3]
df = pd.read_csv('data_sources/ex4.csv', sep=',', skiprows=rows_to_skip)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:
# Read the file as-is, then display a copy in which NA / missing values are
# replaced with 0. The result is not assigned, so `df` itself is unchanged.
df = pd.read_csv('data_sources/ex5.csv')
df.fillna(value=0)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,0
1,two,5,6,0.0,8,world
2,three,9,10,11.0,12,foo


In [11]:
# Load a larger data set (10,000 rows) in a single read.
df = pd.read_csv('data_sources/ex6.csv')
df # a long frame is displayed truncated: the first 5 rows (head) and the last 5 rows (tail)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Uses `df` as loaded from ex6.csv in the previous cell.
df.describe()

Unnamed: 0,one,two,three,four
count,10000.0,10000.0,10000.0,10000.0
mean,0.04575,0.000871,-0.026463,0.015985
std,0.948825,1.003829,1.037273,0.982409
min,-3.726864,-3.465356,-3.234391,-3.173509
25%,-0.618617,-0.706643,-0.727791,-0.676291
50%,0.041638,0.018972,-0.03234,-0.005338
75%,0.701536,0.708405,0.626904,0.659369
max,2.833891,2.946737,3.053345,3.412734


In [15]:
# nrows limits the read to the first 40 data rows instead of the whole file.
# Note: this rebinds `df`, replacing the full 10,000-row frame loaded earlier.
df = pd.read_csv('data_sources/ex6.csv', nrows=40) # reads top 40 rows
df

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [23]:
# With chunksize set, read_csv yields the file piece by piece (100 rows per
# piece) instead of returning one DataFrame.
chunks = pd.read_csv('data_sources/ex6.csv', chunksize=100)

# Running tally of how many times each value of the 'key' column occurs.
alldata = pd.Series([], dtype='float64')
for piece in chunks:
    key_counts = piece['key'].value_counts()
    # fill_value=0 treats a key that is absent on either side as a zero count.
    alldata = alldata.add(key_counts, fill_value=0)
alldata

0    151.0
1    146.0
2    152.0
3    162.0
4    171.0
5    157.0
6    166.0
7    164.0
8    162.0
9    150.0
A    320.0
B    302.0
C    286.0
D    320.0
E    368.0
F    335.0
G    308.0
H    330.0
I    327.0
J    337.0
K    334.0
L    346.0
M    338.0
N    306.0
O    343.0
P    324.0
Q    340.0
R    318.0
S    308.0
T    304.0
U    326.0
V    328.0
W    305.0
X    364.0
Y    314.0
Z    288.0
dtype: float64

## Reading Excel and writing data out again

In [26]:
# Load the workbook's sheet into a DataFrame.
x_data = pd.read_excel('data_sources/ex1.xlsx')
x_data # note the extra 'Unnamed: 0' data column: apparently a row index that was saved into the sheet

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [28]:
# Write the frame to a delimited text file, using '-' rather than the default
# comma as the field separator.
# NOTE(review): a hyphen also appears in negative numbers and hyphenated text,
# so it is a fragile delimiter for general data - confirm it is intended here.
x_data.to_csv('data_sources/out.csv', sep='-')

In [29]:
import json

In [35]:
# A JSON document held as a plain Python string (type(obj) is str).
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
# Parse the JSON text into Python objects; the top level becomes a dict.
r = json.loads(obj)
# Wrap the dict in a Series: one entry per top-level JSON key.
s = pd.Series(r)
# Serialise the Series back to a JSON file on disk.
s.to_json('data_sources/wes.json')

### We can use API endpoints

In [36]:
import requests

# Public GitHub REST endpoint listing issues (pull requests included) for pandas.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# Without an explicit timeout, requests can wait indefinitely on an
# unresponsive server and hang the notebook.
res = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (e.g. rate limiting) instead of silently
# treating an error payload as issue data in the cells that follow.
res.raise_for_status()
data = res.json() # structure of a list of dictionaries
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/44041',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/44041/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/44041/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/44041/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/44041',
  'id': 1027024981,
  'node_id': 'PR_kwDOAA0YD84tO2Ht',
  'number': 44041,
  'title': 'Fixed metadata propagation in Dataframe.apply (issue #28283)',
  'user': {'login': 'moha-rk',
   'id': 56267277,
   'node_id': 'MDQ6VXNlcjU2MjY3Mjc3',
   'avatar_url': 'https://avatars.githubusercontent.com/u/56267277?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/moha-rk',
   'html_url': 'https://github.com/moha-rk',
   'followers_url': 'https://api.github.com/users/moha-rk/followers',
   'following_url': 'https

In [39]:
# Build a frame from the list of issue records, keeping only these fields.
issue_fields = ['number', 'title', 'state', 'comments']
issues = pd.DataFrame(data, columns=issue_fields)
issues

Unnamed: 0,number,title,state,comments
0,44041,Fixed metadata propagation in Dataframe.apply ...,open,0
1,44039,REF: dispatch DTI/TDI setops to RangeIndex,open,0
2,44038,TST: added groupby apply test for nan coerce,open,0
3,44037,DOC: Document and annotate Index.reindex (#403...,open,0
4,44035,TST: adds test for .loc on multiindex for seri...,open,0
5,44034,CLN: no need for suffices anymore in test_hash...,open,0
6,44033,Issue38947,open,2
7,44032,[PERF] fixing memory leak in aggregation.pyx,open,0
8,44031,"BUG: apply swallows exceptions, shows inconsis...",open,0
9,44030,BUG: .fillna({}) doesn't work,open,2
