# Python Import Data CheatSheet

## Importing Data in Python

In [1]:
import numpy as np
import pandas as pd

## Help

In [6]:
print("np.info")
np.info(np.ndarray.dtype)
print("\npd.read_csv")
help(pd.read_csv)

np.info
Data-type of the array's elements.

Parameters
----------
None

Returns
-------
d : numpy dtype object

See Also
--------
numpy.dtype

Examples
--------
>>> x
array([[0, 1],
       [2, 3]])
>>> x.dtype
dtype('int32')
>>> type(x.dtype)
<type 'numpy.dtype'>

pd.read_csv
Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dia

## Text Files

### Plain Text Files

In [18]:
filename = '../sample_files/data/huck_finn.txt'
file = open(filename, mode='r')  # Open the file for reading
text = file.read()               # Read a files content
print(file.closed)               # Check whether file is closed
file.close()                     # Close file
print(text)

# Using context manager `with`
with open(filename, 'r') as file:
  print(file.readline())         # Read a single line
  print(file.readline())         # Read a single line
  print(file.readline())         # Read a single line

False
﻿

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Clemens)

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.net

Title: Adventures of Huckleberry Finn, Complete

Author: Mark Twain (Samuel Clemens)

Release Date: August 20, 2006 [EBook #76]

Last Updated: October 20, 2012]

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK HUCKLEBERRY FINN ***

Produced by David Widger





ADVENTURES

OF

HUCKLEBERRY FINN

(Tom Sawyer's Comrade)

By Mark Twain

Complete




CONTENTS.

CHAPTER I. Civilizing Huck.Miss Watson.Tom Sawyer Waits.

CHAPTER II. The Boys Escape Jim.Torn Sawyer's Gang.Deep-laid Plans.

CHAPTER III. A Good Going-over.Grace Triumphant."One of Tom Sawyers's
Lies".

CHAPTER IV. Huck and the Judge.Superstition.

CHAPTER

### Table Data: Flat Files

#### Importing Flat Files with numpy

In [2]:
# Files with one data type
filename = "../sample_files/data/mnist.txt"
data = np.loadtxt(filename,
                  delimiter=' ', # String used to separate values
                  skiprows=2,    # Skip the first 2 lines
                  usecols=[1,2], # Read the 1st and 3rd column
                  dtype=str)     # The type of the resulting array

print(data)

[['0' '0']
 ['0' '0']
 ['0' '0']
 ...
 ['0' '0']
 ['0' '0']
 ['0' '0']]


In [3]:
# Files with mixed data types
filename = "../sample_files/data/titanic.csv"
data = np.genfromtxt(filename,
                     delimiter=',',
                     names=True,    # Look for column header
                     dtype=None,
                     encoding=None)

print(data)

[('"1"', '"1st"', '"Male"', '"Child"', '"No"',   0)
 ('"2"', '"2nd"', '"Male"', '"Child"', '"No"',   0)
 ('"3"', '"3rd"', '"Male"', '"Child"', '"No"',  35)
 ('"4"', '"Crew"', '"Male"', '"Child"', '"No"',   0)
 ('"5"', '"1st"', '"Female"', '"Child"', '"No"',   0)
 ('"6"', '"2nd"', '"Female"', '"Child"', '"No"',   0)
 ('"7"', '"3rd"', '"Female"', '"Child"', '"No"',  17)
 ('"8"', '"Crew"', '"Female"', '"Child"', '"No"',   0)
 ('"9"', '"1st"', '"Male"', '"Adult"', '"No"', 118)
 ('"10"', '"2nd"', '"Male"', '"Adult"', '"No"', 154)
 ('"11"', '"3rd"', '"Male"', '"Adult"', '"No"', 387)
 ('"12"', '"Crew"', '"Male"', '"Adult"', '"No"', 670)
 ('"13"', '"1st"', '"Female"', '"Adult"', '"No"',   4)
 ('"14"', '"2nd"', '"Female"', '"Adult"', '"No"',  13)
 ('"15"', '"3rd"', '"Female"', '"Adult"', '"No"',  89)
 ('"16"', '"Crew"', '"Female"', '"Adult"', '"No"',   3)
 ('"17"', '"1st"', '"Male"', '"Child"', '"Yes"',   5)
 ('"18"', '"2nd"', '"Male"', '"Child"', '"Yes"',  11)
 ('"19"', '"3rd"', '"Male"', '"Ch

In [46]:
# The default dtype of the np.recfromcsv() function is None.
data_array = np.recfromcsv(filename, encoding="utf-8")
print(data_array)

[('"1"', '"1st"', '"Male"', '"Child"', '"No"',   0)
 ('"2"', '"2nd"', '"Male"', '"Child"', '"No"',   0)
 ('"3"', '"3rd"', '"Male"', '"Child"', '"No"',  35)
 ('"4"', '"Crew"', '"Male"', '"Child"', '"No"',   0)
 ('"5"', '"1st"', '"Female"', '"Child"', '"No"',   0)
 ('"6"', '"2nd"', '"Female"', '"Child"', '"No"',   0)
 ('"7"', '"3rd"', '"Female"', '"Child"', '"No"',  17)
 ('"8"', '"Crew"', '"Female"', '"Child"', '"No"',   0)
 ('"9"', '"1st"', '"Male"', '"Adult"', '"No"', 118)
 ('"10"', '"2nd"', '"Male"', '"Adult"', '"No"', 154)
 ('"11"', '"3rd"', '"Male"', '"Adult"', '"No"', 387)
 ('"12"', '"Crew"', '"Male"', '"Adult"', '"No"', 670)
 ('"13"', '"1st"', '"Female"', '"Adult"', '"No"',   4)
 ('"14"', '"2nd"', '"Female"', '"Adult"', '"No"',  13)
 ('"15"', '"3rd"', '"Female"', '"Adult"', '"No"',  89)
 ('"16"', '"Crew"', '"Female"', '"Adult"', '"No"',   3)
 ('"17"', '"1st"', '"Male"', '"Child"', '"Yes"',   5)
 ('"18"', '"2nd"', '"Male"', '"Child"', '"Yes"',  11)
 ('"19"', '"3rd"', '"Male"', '"Ch

#### Importing Flat Files with pandas

In [58]:
filename = "../sample_files/data/winequality-red.csv"
data = pd.read_csv(filename,
                   nrows=5,        # Number of rows of file to read
                   header=0,       # Row number to use as col names
                   sep='\t',       # Delimiter to use
                   comment='#',    # Character to split comments
                   na_values=[""]) # String to recognize as NA/NaN

print(data)

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

## Excel Spreadsheets

In [15]:
file = '../sample_files/data/urbanpop.xlsx'

data = pd.ExcelFile(file)
df_sheet1 = data.parse('1960-1966',
                       skiprows=[0],
                       names=['Country', '1960', '1961', '1962', '1963', '1964', '1965', '1966'])
print(df_sheet1)

df_sheet2 = data.parse(0,
                       parse_cols=[0],
                       skiprows=[0],
                       names=None)
print(df_sheet2)


                      Country         1960          1961          1962  \
0                     Albania     494443.0  5.118028e+05  5.294389e+05   
1                     Algeria    3293999.0  3.515148e+06  3.739963e+06   
2              American Samoa          NaN  1.366030e+04  1.416580e+04   
3                     Andorra          NaN  8.723921e+03  9.700346e+03   
4                      Angola     521205.0  5.482650e+05  5.796954e+05   
5         Antigua and Barbuda      21699.0  2.163505e+04  2.166420e+04   
6                   Argentina   15224096.0  1.554522e+07  1.591212e+07   
7                     Armenia     957974.0  1.008597e+06  1.061426e+06   
8                       Aruba      24996.0  2.813976e+04  2.853273e+04   
9                   Australia    8375329.0  8.587695e+06  8.841891e+06   
10                    Austria    4560057.0  4.590377e+06  4.622747e+06   
11                 Azerbaijan    1857673.0  2.108328e+06  2.167604e+06   
12                    Bahamas      654

In [14]:
# To access the sheet names, use the sheet_names attribute
data.sheet_names

['1960-1966', '1967-1974', '1975-2011']

## Relational Databases

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite://Northwind.sqlite')

In [None]:
# Use the table_names() method to fetch a list of table names:
table_names = engine.table_names()

### Query relational databases

In [None]:
con = engine.connect()
rs = con.execute("SELECT * FROM Orders")
df = pd.DataFrame(rs.fetchall())
df.columns = rs.keys()
con.close()

In [None]:
# Using the context manager `with`
with engine.connect() as con:
  rs = con.execute("SELECT OrderID FROM Orders")
  df = pd.DataFrame(rs.fetchmany(size=5))
  df.columns = rs.keys()

#### Querying relational databases with pandas

In [None]:
df = pd.read_sql_query("SELECT * FROM Orders", engine)

## Pickeled Files

In [None]:
import pickle
with open('pickled_fruit.pkl', 'rb') as file:
  pickled_data = pickle.load(file)

## HDF5 Files

In [None]:
import h5py
filename = 'H-H1_LOSC_4_v1-815411200-4096.hdf5'
data = h5py.File(filename, 'r')

## Matlab Files

In [None]:
import scipy.io
filename = 'workspace.mat'
mat = scipy.io.loadmat(filename)

## Exploring your Data

### Numpy Arrays

In [None]:
data_array.dtype  # Data type of array elements
data_array.shape  # Array dimensions
len(data_array)   # Length of array

### pandas DataFrames

In [None]:
df.head()                # Return first DataFrame rows
df.tail()                # Return last DataFrame rows
df.index                 # Describe index
df.columns               # Describe DataFrame columns
df.info()                # Info on DataFrame
data_array = data.values # Convert a DataFrame to an a NumPy array

### Dictionaries

#### Accessing Elements with Functions

In [None]:
print(mat.keys())       # Print dictionary keys
for key in data.keys(): # Print dictionary keys
  print(key)
pickled_data.values()   # Return dictionary values
print(mat.items())      # Returns items in list format of (key, value) tuple pairs

#### Accessing Data Items with Keys

In [None]:
for key in data ['meta'].keys()          # Explore the HDF5 structure
  print(key)
print(data['meta']['Description'].value) # Retrieve the value for a key