# Importing lotsa files

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import glob

In [3]:
# Go into largest cities and load everything
filenames = glob.glob('16-classwork/largest-cities/*.csv')
filenames

['16-classwork/largest-cities/india.csv',
 '16-classwork/largest-cities/germany.csv',
 '16-classwork/largest-cities/usa.csv',
 '16-classwork/largest-cities/russia.csv']

In [5]:
# loop through all the filenames and load them
dataframes = []

for filename in filenames:
  df = pd.read_csv(filename)
  df['filename'] = filename
  dataframes.append(df)
  
everything = pd.concat(dataframes, ignore_index=True)
everything

Unnamed: 0,city,population,filename
0,Mumbai (Bombay),16368000,16-classwork/largest-cities/india.csv
1,Kolkata (Calcutta),13217000,16-classwork/largest-cities/india.csv
2,Delhi,12791000,16-classwork/largest-cities/india.csv
3,Chennai,6425000,16-classwork/largest-cities/india.csv
4,Berlin,3426354,16-classwork/largest-cities/germany.csv
5,Hamburg,1739117,16-classwork/largest-cities/germany.csv
6,Munich,1260391,16-classwork/largest-cities/germany.csv
7,Cologne,963395,16-classwork/largest-cities/germany.csv
8,New York City,8550405,16-classwork/largest-cities/usa.csv
9,Los Angeles,3971883,16-classwork/largest-cities/usa.csv


# Dealing with Excel Files

In [6]:
pd.read_excel('16-classwork/visadata.xls', sheet_name='FY97')

Unnamed: 0,Fiscal Year 1997,A-1,A-2,A-3,B-1,"B-1,2",B-2,"B-1,2/BCC","B-1,2/BCV",C-1,...,U-1,U-2,U-3,U-4,V-1,V-2,V-3,Total Visas,BCC,Grand Total
0,Africa,,,,,,,,,,...,,,,,,,,,,
1,Algeria,62.0,23.0,5.0,1661.0,507.0,3430.0,0.0,0.0,57.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6386.0,492.0,6878.0
2,Angola,54.0,169.0,3.0,10.0,1421.0,271.0,1.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2397.0,4.0,2401.0
3,Benin,10.0,43.0,4.0,104.0,375.0,249.0,0.0,0.0,37.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1105.0,10.0,1115.0
4,Botswana,10.0,79.0,1.0,2.0,366.0,54.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,850.0,0.0,850.0
5,Burkina Faso,19.0,7.0,1.0,12.0,487.0,309.0,1.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1192.0,4.0,1196.0
6,Burundi,5.0,2.0,0.0,57.0,97.0,60.0,3.0,0.0,49.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,387.0,74.0,461.0
7,Cameroon,44.0,52.0,11.0,242.0,768.0,1128.0,0.0,0.0,53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2960.0,10.0,2970.0
8,Cape Verde,3.0,23.0,2.0,17.0,3273.0,763.0,0.0,0.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4261.0,1.0,4262.0
9,Central African Republic,7.0,5.0,0.0,7.0,69.0,33.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208.0,2.0,210.0


In [7]:
# reading sheets in an Excel file
xls = pd.ExcelFile('16-classwork/visadata.xls')
sheet_names = xls.sheet_names

In [8]:
dataframes = []

for sheet in sheet_names:
  # Slow version:
  # df = pd.read_excel('16-classwork/visadata.xls', sheet_name=sheet)
  
  # Fast version
  df = xls.parse(sheet)
  df['sheet'] = sheet
  dataframes.append(df)
  
everything = pd.concat(dataframes, ignore_index=True, sort=True)
everything

Unnamed: 0,A-1,A-2,A-3,B-1,"B-1,2","B-1,2/BCC","B-1,2/BCV",B-2,BCC,C-1,...,Total Visas,U-1,U-2,U-3,U-4,U-5,V-1,V-2,V-3,sheet
0,,,,,,,,,,,...,,,,,,,,,,FY97
1,62.0,23.0,5.0,1661.0,507.0,0.0,0.0,3430.0,492.0,57.0,...,6386.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
2,54.0,169.0,3.0,10.0,1421.0,1.0,0.0,271.0,4.0,10.0,...,2397.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
3,10.0,43.0,4.0,104.0,375.0,0.0,0.0,249.0,10.0,37.0,...,1105.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
4,10.0,79.0,1.0,2.0,366.0,0.0,0.0,54.0,0.0,3.0,...,850.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
5,19.0,7.0,1.0,12.0,487.0,1.0,0.0,309.0,4.0,24.0,...,1192.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
6,5.0,2.0,0.0,57.0,97.0,3.0,0.0,60.0,74.0,49.0,...,387.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
7,44.0,52.0,11.0,242.0,768.0,0.0,0.0,1128.0,10.0,53.0,...,2960.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
8,3.0,23.0,2.0,17.0,3273.0,0.0,0.0,763.0,1.0,16.0,...,4261.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
9,7.0,5.0,0.0,7.0,69.0,0.0,0.0,33.0,2.0,5.0,...,208.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,FY97
