# Importing Data with Pandas


Pandas makes it straightforward to import data in a variety of different formats. 

See the [documentation](https://pandas.pydata.org/pandas-docs/stable/api.html#flat-file) for detailed information


In [3]:
import pandas as pd

### Create some Data

In [14]:
data = {'first': ['James', 'Jane', 'Adam', 'Sara', 'Tom', 'Carol'], 
        'last': ['Smith', 'Watson', 'Miller', 'Thompson', 'Piper', 'Winters'], 
        'age': [18, 18, 19, 19, 20, 20], 
        'height_cm': [75.7, 163, 176.5, 163.3, 168.5, 177.5],
        'weight_kg': [66.9, 56.7, 68.9, 58, 57.5, 63.4],
        'income':['1,000', 800, 350, 980, '2,500', '2,950']}

# create a DataFrame
df = pd.DataFrame(data, columns = ['first', 'last', 'age', 'height_cm', 'weight_kg', 'income'])

df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950


# Saving Data
- We will save the data in different formats, then re-import using 3 different methods.

In [15]:
# save to CSV
df.to_csv('people_data.csv', index=False)

# save to TSV
# supply \t as separator
df.to_csv('people_data.tsv', index=False, sep='\t')

# save to EXCEL
df.to_excel('people_data.xlsx', index=False)

# Importing Data

### Load CSV
- using <code>.read_csv()</code>

In [16]:
df = pd.read_csv('people_data.csv')
df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950


### Load CSV with no header

In [17]:
df = pd.read_csv('people_data.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,first,last,age,height_cm,weight_kg,income
1,James,Smith,18,75.7,66.9,1000
2,Jane,Watson,18,163.0,56.7,800
3,Adam,Miller,19,176.5,68.9,350
4,Sara,Thompson,19,163.3,58.0,980
5,Tom,Piper,20,168.5,57.5,2500
6,Carol,Winters,20,177.5,63.4,2950


### Load CSV with specific header

In [18]:
df = pd.read_csv('people_data.csv', names=['first_name', 'last_name', 'age', 'height_in_cm', 'weight_in_kg', 'monthly_income'])
df

Unnamed: 0,first_name,last_name,age,height_in_cm,weight_in_kg,monthly_income
0,first,last,age,height_cm,weight_kg,income
1,James,Smith,18,75.7,66.9,1000
2,Jane,Watson,18,163.0,56.7,800
3,Adam,Miller,19,176.5,68.9,350
4,Sara,Thompson,19,163.3,58.0,980
5,Tom,Piper,20,168.5,57.5,2500
6,Carol,Winters,20,177.5,63.4,2950


### Load CSV - skip first 3 rows

In [19]:
df = pd.read_csv('people_data.csv', skiprows=3)
df

Unnamed: 0,Adam,Miller,19,176.5,68.9,350
0,Sara,Thompson,19,163.3,58.0,980
1,Tom,Piper,20,168.5,57.5,2500
2,Carol,Winters,20,177.5,63.4,2950


### Load CSV - specify thousands seperator

In [21]:
df = pd.read_csv('people_data.csv', thousands=',')
df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950


### Load TSV
- using <code>.read_csv()</code>

In [23]:
df = pd.read_excel('people_data.xlsx', sep='\t')
df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950


### Load EXCEL
- using <code>.read_excel()</code>

In [24]:
df = pd.read_excel('people_data.xlsx')
df

Unnamed: 0,first,last,age,height_cm,weight_kg,income
0,James,Smith,18,75.7,66.9,1000
1,Jane,Watson,18,163.0,56.7,800
2,Adam,Miller,19,176.5,68.9,350
3,Sara,Thompson,19,163.3,58.0,980
4,Tom,Piper,20,168.5,57.5,2500
5,Carol,Winters,20,177.5,63.4,2950
