In [2]:
import pandas as pd
import numpy as np

### Importing data

In [3]:
#Read the complete CSV into memory with a single call; the first line of the file is the header
data_frame = pd.read_csv('flower.csv', sep=',', header=0)
#Preview the first five rows
data_frame.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
#Relabel the rows so the index starts at 20 instead of 0 (one consecutive label per row)
first_label = 20
new_index = np.arange(first_label, first_label + len(data_frame))
data_frame.index = new_index
data_frame.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
20,5.1,3.5,1.4,0.2,setosa
21,4.9,3.0,1.4,0.2,setosa
22,4.7,3.2,1.3,0.2,setosa
23,4.6,3.1,1.5,0.2,setosa
24,5.0,3.6,1.4,0.2,setosa


In [9]:
#Import the dataset in chunks of 20 rows that are loaded into memory one at a time.
#NOTE: with chunksize set, read_csv hands back an iterator over DataFrames rather than a DataFrame.
data_frame2 = pd.read_csv('flower.csv', sep=',', header=0, chunksize=20)
index_message = 'This is the index range of the first row of the chunk: {}'
for chunk in data_frame2:
    rows_in_chunk = len(chunk)
    print(rows_in_chunk)
    #chunk.index is the index of the whole chunk, so the full label range gets printed
    print(index_message.format(chunk.index))
print("Once you iterate all the chunks, You have to reload the dataframe, Once used it's gone!")

20
This is the index range of the first row of the chunk: RangeIndex(start=0, stop=20, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=20, stop=40, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=40, stop=60, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=60, stop=80, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=80, stop=100, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=100, stop=120, step=1)
20
This is the index range of the first row of the chunk: RangeIndex(start=120, stop=140, step=1)
10
This is the index range of the first row of the chunk: RangeIndex(start=140, stop=150, step=1)
Once you iterate all the chunks, You have to reload the dataframe, Once used it's gone!


### Missing values

In [16]:
#Small example frame with missing values: col_3 is entirely NaN, col_4 has no gaps
column_names = ["col_1", "col_2", "col_3", "col_4"]
rows = [
    [np.nan, 2, np.nan, 0],
    [3, 4, np.nan, 1],
    [np.nan, np.nan, np.nan, 5],
    [np.nan, 3, np.nan, 4],
]
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,col_1,col_2,col_3,col_4
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [17]:
#Replace every missing value with 0; fillna returns a new frame, df itself is not modified
df.fillna(value=0).head()

Unnamed: 0,col_1,col_2,col_3,col_4
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [23]:
#Fill each column with its own replacement value; col_2 uses the median of its existing values
median_col2 = df['col_2'].median()
values = {
    "col_1": 0,
    "col_2": median_col2,
    "col_3": 2,
    "col_4": 3,
}
df.fillna(value=values).head()

Unnamed: 0,col_1,col_2,col_3,col_4
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,3.0,2.0,5
3,0.0,3.0,2.0,4


### Data querying

In [47]:
#Load the dataset and make the column names usable as plain identifiers inside query():
#every space in a header is replaced by an underscore ('sepal length' -> 'sepal_length').
#Renaming by rule instead of assigning a fixed list of names cannot mislabel columns
#(or raise a length mismatch) if the column order or count of the CSV ever changes.
data_frame = (
    pd.read_csv('flower.csv', delimiter=',', header=0)
    .rename(columns=lambda name: name.replace(' ', '_'))
)
#Keep only the rows whose species is 'setosa' and preview them
data_frame.query("species == 'setosa'").head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [48]:
#Several conditions joined with `and`, including a chained comparison on petal_length
selection = "sepal_length > 5 and petal_width < 0.5 and  1.5 <= petal_length <= 1.7 and petal_width == 0.4"
data_frame.query(selection)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
5,5.4,3.9,1.7,0.4,setosa
15,5.7,4.4,1.5,0.4,setosa
21,5.1,3.7,1.5,0.4,setosa
31,5.4,3.4,1.5,0.4,setosa


### Applying functions

In [56]:
#Reload the CSV so this section starts again from the original column names and values
data_frame = pd.read_csv('flower.csv', sep=',', header=0)
data_frame.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [57]:
def multiply_by_10(val):
    """Return ``val`` multiplied by 10."""
    scaled = val * 10
    return scaled
#Run multiply_by_10 on every value of the column and store the result back in place.
#NOTE: re-running this cell multiplies the column again; reload the CSV first to start over.
scaled_sepal_length = data_frame['sepal length'].apply(multiply_by_10)
data_frame['sepal length'] = scaled_sepal_length
data_frame.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,51.0,3.5,1.4,0.2,setosa
1,49.0,3.0,1.4,0.2,setosa
2,47.0,3.2,1.3,0.2,setosa
3,46.0,3.1,1.5,0.2,setosa
4,50.0,3.6,1.4,0.2,setosa


In [58]:
#Reload the CSV to discard the scaling applied to 'sepal length' above
data_frame = pd.read_csv('flower.csv', sep=',', header=0)

def product_of_vals(row):
    """Return the product of the first four values of ``row``.

    Parameters
    ----------
    row : pandas.Series, or any positionally indexable sequence/array
        Must hold at least four values that can be multiplied together.

    Returns
    -------
    The value of ``row[0] * row[1] * row[2] * row[3]``, taken by position.
    """
    # A Series passed in by DataFrame.apply(axis=1) is labelled with the column
    # names, so integer keys must go through .iloc: plain row[0] relies on a
    # positional fallback that pandas has deprecated. Lists, tuples and numpy
    # arrays have no .iloc and are already indexed by position.
    values = row.iloc if hasattr(row, 'iloc') else row
    return values[0] * values[1] * values[2] * values[3]
#axis=1 hands each row (not each column) to product_of_vals; the result becomes a new column
row_products = data_frame.apply(product_of_vals, axis=1)
data_frame['product'] = row_products
data_frame.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species,product
0,5.1,3.5,1.4,0.2,setosa,4.998
1,4.9,3.0,1.4,0.2,setosa,4.116
2,4.7,3.2,1.3,0.2,setosa,3.9104
3,4.6,3.1,1.5,0.2,setosa,4.278
4,5.0,3.6,1.4,0.2,setosa,5.04


### One Hot Encoding

In [68]:
#Reload the CSV, then one-hot encode 'species': one indicator column per distinct value,
#each named 'type_<value>'
data_frame = pd.read_csv('flower.csv', sep=',', header=0)
species = data_frame['species']
col = pd.get_dummies(species, prefix='type')
col

Unnamed: 0,type_setosa,type_versicolor,type_virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
145,0,0,1
146,0,0,1
147,0,0,1
148,0,0,1


In [69]:
#Append the one-hot indicator columns to the original frame side by side (axis=1),
#then drop the categorical 'species' column they replace
new_df = pd.concat([data_frame, col], axis=1).drop(columns=['species'])
#An assignment alone displays nothing; end the cell with the frame so the result is shown
new_df

Unnamed: 0,sepal length,sepal width,petal length,petal width,type_setosa,type_versicolor,type_virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1
