# Missing Data

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [None]:
# Small string Series with one missing entry (NaN) to experiment on.
data = pd.Series(data=['one', 'two', np.nan, 'four'])
data

In [None]:
# Boolean mask with the same index as `data`: True where the entry is missing (NaN).
data.isnull()

In [None]:
# Returns a new Series without the missing entries; the result is not assigned,
# so `data` itself is left unchanged.
data.dropna()

In [None]:
# 4x3 frame with NaNs in different places: one clean row, two rows with a
# single NaN each, and one row that is entirely NaN.
dframe = pd.DataFrame(
    data=[
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)
dframe

In [None]:
# Element-wise boolean mask with the same shape as `dframe`: True where a value is missing.
dframe.isnull()

In [None]:
dframe.isnull().all()
# For each feature (column): True only if EVERY value of that feature is missing.
# To check whether a feature has at least one missing value, use .any() instead of .all().

In [None]:
dframe.isnull().all(axis = 1)
# For each record (row): True only if EVERY value of that record is missing
# (here only the last row). To check whether a record has at least one missing
# value, use .any(axis = 1) instead.

In [None]:
clean_dframe = dframe.dropna()
# Drop every row that contains at least one missing value (the default, how='any').
# Only the first row of dframe is complete, so it is the only row kept.
clean_dframe

In [None]:
# how='all' drops a row only when every value in it is missing,
# so only the last (all-NaN) row of dframe is removed.
dframe.dropna(how = 'all')

In [None]:
dframe.dropna(axis = 1)
# Drop every column that contains at least one missing value.
# The last row of dframe is all NaN, so every column contains a NaN and no column survives here.

In [None]:
# Short alias for NaN, then a 4x4 frame (columns A-D) built column by column.
npn = np.nan
dframe2 = pd.DataFrame(
    {
        'A': [1, 2, npn, 1],
        'B': [2, npn, 7, npn],
        'C': [3, 5, npn, npn],
        'D': [npn, 6, 9, npn],
    },
    columns=['A', 'B', 'C', 'D'],
)
dframe2

In [None]:
dframe2.dropna(thresh=2)
# Keep only the rows with at least 2 non-null values; here only the last row
# (a single non-null value) is dropped. The result is a new frame.
# Use dframe2.dropna(thresh = 2, inplace = True) to change dframe2 itself without assignment.

In [None]:
# dframe2 is unchanged: the dropna(thresh=2) result above was neither assigned nor applied in place.
dframe2

In [None]:
# Replace every missing value with the constant 1; returns a new frame.
dframe2.fillna(1)

In [None]:
dframe2.fillna({'A':0, 'B':1, 'C':2, 'D':3})
# Fill the missing values of each column with a different value:
# the dictionary keys are column labels and the values are the fill values for those columns.
# Pass inplace=True to modify the table in place instead of returning a new one.

In [None]:
dframe2.mean()
# Column-wise mean; missing values are skipped by default.
# The result is a Series indexed by column label (verify with type(dframe2.mean())).

In [None]:
dframe2.fillna(dframe2.mean())
# Fill the missing values of each column with the mean of that column:
# the Series returned by mean() is matched to the columns by label.

# Index Hierarchy

In [None]:
from numpy.random import randn

In [None]:
# Six random values under a two-level index:
# outer level 1/2, inner level a/b/c.
mySer = Series(
    data=randn(6),
    index=pd.MultiIndex.from_arrays(
        [[1, 1, 1, 2, 2, 2], ['a', 'b', 'c', 'a', 'b', 'c']]
    ),
)
mySer

In [None]:
# Inspect the hierarchical (two-level) index that was built from the two label lists.
mySer.index

In [None]:
# Select by the outer-level label 1 (a label lookup, not position 1):
# yields the sub-series indexed by the inner labels a, b, c.
mySer[1]

In [None]:
# Select the inner-level label 'a' across all outer-level labels.
mySer[:,'a']

In [None]:
# Pivot the inner index level into columns: rows are the outer labels (1, 2),
# columns are the inner labels (a, b, c). Note that this rebinds `dframe`.
dframe = mySer.unstack()
dframe

In [None]:
# 4x4 frame of 0..15 with two-level labels on both axes
# (this rebinds `dframe2`).
dframe2 = DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays(
        [['Qom', 'Qom', 'Tehran', 'Rasht'], ['day', 'night', 'night', 'day']]
    ),
)
dframe2

In [None]:
# Name the levels of both axes so that later calls can refer to a level by name
# instead of by position.
dframe2.index.names = ['Index_1', 'Index_2']
dframe2.columns.names = ['Cities', 'Time']
dframe2

In [None]:
# Swap the two column levels (axis = 1) so that 'Time' becomes the outer level;
# the result is a new frame and dframe2 is not modified.
dframe2.swaplevel('Cities', 'Time', axis = 1)

In [None]:
# Sort the rows by the inner index level (level 1, 'Index_2').
# DataFrame.sortlevel has been removed from pandas; sort_index(level=...) is
# its replacement and returns a new, sorted frame.
dframe2.sort_index(level=1)

In [None]:
# Sort the columns by the outer column level (level 0, 'Cities').
# DataFrame.sortlevel has been removed from pandas; sort_index(level=..., axis=1)
# is its replacement and returns a new, sorted frame.
dframe2.sort_index(level=0, axis=1)

In [None]:
# Sum the columns that share the same 'Time' label (day / night), per row.
# The `level` argument of sum() was removed in pandas 2.0, so group on the
# level instead. Grouping columns is done on the transpose (then transposed
# back) because groupby(axis=1) is itself deprecated. sort=False keeps the
# groups in order of first appearance, as the old level-based sum did.
dframe2.T.groupby(level='Time', sort=False).sum().T

In [None]:
# Show dframe2 again; none of the swap/sort/sum results above were assigned back to it.
dframe2

In [None]:
# Minimum over the rows that share the same 'Index_2' label, per column.
# The `level` argument of min() was removed in pandas 2.0, so group on the
# index level instead. sort=False keeps the groups in order of first
# appearance, as the old level-based reduction did.
dframe2.groupby(level='Index_2', sort=False).min()

# Reading and writing Text Files

In [None]:
# Read the CSV with default settings: the first line of the file is used as the
# column header, which is wrong if the file has no header row (see the next cell).
dframe = pd.read_csv("testData.csv")
dframe

In [None]:
# header = None treats the first line as data and numbers the columns 0..n-1;
# meaningful names are then assigned by hand.
# NOTE: the assignment assumes testData.csv has exactly four columns.
dframe = pd.read_csv("testData.csv", header = None)
dframe.columns = ["Age", "Toefl", "Degree", "UniRank"]
dframe

In [None]:
# read_table is the general delimited-text reader; with sep=',' it reads the
# same file as read_csv does. Note that this rebinds `dframe2`.
dframe2 = pd.read_table("testData.csv", sep=',', header = None)
dframe2

In [None]:
# When the file is too large, read only its first rows: nrows = 2 limits the
# result to the first two data rows.
pd.read_csv("testData.csv", header = None, nrows = 2)

In [None]:
# Write dframe to disk as comma-separated text; by default both the row index
# and the column header are written.
dframe.to_csv("testDataOut.csv")

In [None]:
import sys
dframe.to_csv(sys.stdout)
# Writing to sys.stdout instead of a file path only prints the CSV text,
# which previews exactly how the data would be saved.

In [None]:
# Same preview, but with "-" as the field delimiter instead of a comma.
dframe.to_csv(sys.stdout, sep = "-")

In [None]:
# Write only a subset of the columns (plus the row index), in the order given.
dframe.to_csv(sys.stdout, columns = ["Age", "Degree"])

# Reading JSON, HTML, and Excel

In [None]:
import codecs
# Open the JSON file as UTF-8 text and parse it into a DataFrame.
# The with-block guarantees the file handle is closed once parsing finishes;
# previously the handle returned by codecs.open was never closed.
with codecs.open('0_1000.json', 'r', 'utf-8') as json_file:
    dframe = pd.read_json(json_file)
# Show only the first record to check the parsed structure.
dframe.head(n = 1)

In [None]:
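# Run this in the Anaconda Prompt (not in the notebook) before the next cell,
# to install the HTML parser used when reading HTML tables:
# pip install html5lib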

In [None]:
# Download the page and parse every HTML <table> on it into a DataFrame
# (requires network access and an installed HTML parser).
url = "https://fdic.gov/bank/individual/failed/banklist.html"
# Use the public top-level pd.read_html rather than reaching into the
# pd.io.html implementation module; it returns a list of DataFrames.
dframe_list = pd.read_html(url)
# The first table on the page is the one of interest.
dframe = dframe_list[0]
dframe.head(n = 2)

In [None]:
# Read a sheet of the Excel workbook into a DataFrame (the first sheet by default).
# Note that this rebinds `data`, which earlier in the notebook held a Series.
# NOTE(review): reading the legacy .xls format presumably needs an extra Excel
# engine package installed; confirm in the target environment.
data = pd.read_excel("Book1.xls")
data