In [1]:
import pandas as pd
import numpy as np

# DataFrame and Series creation

In [28]:
# Legacy NumPy generator with a fixed seed (47), shared by the random examples
rng = np.random.RandomState(seed=47)

Let's create some basic pandas objects: a Series and a DataFrame

In [30]:
# Four random integers drawn from 0 through 9 (randint's upper bound is exclusive)
series = pd.Series(rng.randint(low=0, high=10, size=4))
series

0    8
1    3
2    0
3    7
dtype: int64

Creating a DataFrame from a 2-D array of values and an array of column names

In [7]:
# Column labels 'a', 'b', 'c' held in a NumPy string array
x = np.array(list('abc'))

In [27]:
# 3 rows x 10 columns of random integers from 0 through 9
arrs = rng.randint(low=0, high=10, size=(3, 10))

In [12]:
# Show the 3 x 10 integer array that the DataFrames below are built from
arrs

array([[7, 6, 7, 8, 8, 3, 0, 7, 0, 7],
       [7, 1, 7, 2, 2, 1, 7, 4, 8, 9],
       [2, 9, 1, 5, 0, 9, 2, 0, 2, 1]])

In [15]:
# Transpose so that each of the three rows of arrs becomes one column,
# labelled with the matching entry of x
df = pd.DataFrame(arrs.transpose(), columns=x)
df.head()

Unnamed: 0,a,b,c
0,7,7,2
1,6,1,9
2,7,7,1
3,8,2,5
4,8,2,0


We can also create a DataFrame from a dictionary; the dictionary keys become the column names

In [24]:
# Pair each column label in x with the matching row of arrs
dictionary = {label: row for label, row in zip(x, arrs)}
dictionary

{'a': array([7, 6, 7, 8, 8, 3, 0, 7, 0, 7]),
 'b': array([7, 1, 7, 2, 2, 1, 7, 4, 8, 9]),
 'c': array([2, 9, 1, 5, 0, 9, 2, 0, 2, 1])}

In [25]:
# Dictionary keys become the column names, the arrays become the column values
df = pd.DataFrame.from_dict(dictionary)
df.head()

Unnamed: 0,a,b,c
0,7,7,2
1,6,1,9
2,7,7,1
3,8,2,5
4,8,2,0


In order to visualize the DataFrames we will create a simple function to display them more easily

In [243]:
from IPython.display import display_html
def display_pds(*args):
    """Render any number of DataFrames next to each other in the notebook.

    Each frame is converted with ``to_html()`` and every opening ``<table``
    tag receives an inline style so the tables flow horizontally.

    Parameters
    ----------
    *args : pandas.DataFrame
        The frames to show, rendered left to right in the order given.

    Returns
    -------
    None
        The combined HTML is handed to ``display_html`` as raw markup.
    """
    inline_table = '<table style="display:inline; margin:5px;"'
    html_str = ''.join(_df.to_html() for _df in args)
    # Only the opening tag is rewritten. Replacing the bare word 'table' would
    # also alter the closing </table> tag and any header or cell text that
    # happens to contain the word.
    display_html(html_str.replace('<table', inline_table), raw=True)

We can combine DataFrames with the pandas function `pd.concat()`, which works along the same lines as the NumPy function `np.concatenate`

In [244]:
# Two 2x2 frames with the same columns, stacked vertically; ignore_index
# renumbers the rows of the result from 0
df1 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), columns=list('AB'))
df2 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), columns=list('AB'))
df_concat = pd.concat([df1, df2], axis=0, ignore_index=True)
display_pds(df1, df2, df_concat)

Unnamed: 0,A,B
0,9,5
1,3,2

Unnamed: 0,A,B
0,6,5
1,0,3

Unnamed: 0,A,B
0,9,5
1,3,2
2,6,5
3,0,3


We can also specify the axis along which we want to concatenate our DataFrames using the `axis` argument

In [245]:
# Two 2x2 frames with different columns but the same row index, joined side by side
df1 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), columns=list('AB'))
df2 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), columns=list('CD'))
df_concat = pd.concat([df1, df2], axis='columns')
display_pds(df1, df2, df_concat)

Unnamed: 0,A,B
0,5,6
1,7,2

Unnamed: 0,C,D
0,8,4
1,8,4

Unnamed: 0,A,B,C,D
0,5,6,8,4
1,7,2,8,4


If the indexes are not aligned, we get NaN values in the positions where no value is defined

In [246]:
# The row indexes do not overlap (0-1 versus 2-3), so a side-by-side concat
# has no value to put in half of the cells and fills them with NaN
df1 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), index=[0, 1], columns=list('AB'))
df2 = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), index=[2, 3], columns=list('AB'))
df_concat = pd.concat([df1, df2], axis='columns')
display_pds(df1, df2, df_concat)

Unnamed: 0,A,B
0,5,5
1,5,9

Unnamed: 0,A,B
2,3,7
3,5,5

Unnamed: 0,A,B,A.1,B.1
0,5.0,5.0,,
1,5.0,9.0,,
2,,,3.0,7.0
3,,,5.0,5.0


### NumPy Ufuncs on pandas objects

When we apply a NumPy function to a pandas DataFrame, the result is also a pandas DataFrame

In [17]:
# The NumPy ufunc is applied element-wise and the result keeps the DataFrame's labels
squared = np.power(df, 2)
squared.head()

Unnamed: 0,a,b,c
0,49,49,4
1,36,1,81
2,49,49,1
3,64,4,25
4,64,4,0


### Let's see some examples on the iris dataset from the sklearn package

In [32]:
from sklearn import datasets

In [198]:
iris = datasets.load_iris()
names = iris.feature_names  # column names (the features of the flowers)
iris_data = np.array(iris.data)  # feature values
target = np.array(iris.target)  # class label of each flower, ranging from 0 to 2

In [199]:
# One row per flower, one column per feature name
df = pd.DataFrame(data=iris_data, columns=names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Adding the target classification to the dataframe

In [201]:
# Attach the class labels as a new 'target' column on the existing frame
df['target'] = target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


If the target is already in its own DataFrame, we can concatenate the two DataFrames

In [83]:
# Hold the class labels in their own single-column frame
target_pd = pd.Series(target, name='target').to_frame()
# Rebuild the feature frame from the raw data so it holds only the feature columns
df = pd.DataFrame(data=iris_data, columns=names)

In [84]:
# Join the features and the labels side by side; rows are matched on the index
df = pd.concat([df, target_pd], axis='columns')
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


pandas has two main indexers: `iloc` (by integer position) and `loc` (by label)

In [87]:
# iloc selects by integer position; a single row comes back as a Series keyed
# by column name (target shows as 0.0 because the whole row shares one float dtype)
df.iloc[0]

sepal length (cm)    5.1
sepal width (cm)     3.5
petal length (cm)    1.4
petal width (cm)     0.2
target               0.0
Name: 0, dtype: float64

If we only want to see the values

In [89]:
# Return only the row's values as a NumPy array, without the labels.
# to_numpy() replaces Series.ravel(), which recent pandas releases deprecate.
df.iloc[0].to_numpy()

array([5.1, 3.5, 1.4, 0.2, 0. ])

In [90]:
# The .values attribute gives the same NumPy array of the row's values
df.iloc[0].values

array([5.1, 3.5, 1.4, 0.2, 0. ])

Show only the first 2 rows and the first 2 columns

In [91]:
# Positional slices exclude the end point: rows 0-1 and the first two columns
df.iloc[:2, :2]

Unnamed: 0,sepal length (cm),sepal width (cm)
0,5.1,3.5
1,4.9,3.0


When using the `loc` indexer we can extract data by label, such as the column name; we can also combine a row slice with a column label in the same call

In [103]:
# loc slices by label and includes the end label, so rows 0 through 4 (five
# rows) need ':4'; ':5' would return six rows and would not match the
# positional '[:5]' slice shown next.
df.loc[:4, 'sepal length (cm)']

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length (cm), dtype: float64

This is also equivalent to:

In [104]:
# Pick the column first, then take its first five entries by position
df['sepal length (cm)'].iloc[:5]

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length (cm), dtype: float64

We can also use a boolean condition to choose which rows we want to extract

In [102]:
# Sepal lengths of the rows whose target is 0, as a plain NumPy array.
# to_numpy() replaces Series.ravel(), which recent pandas releases deprecate.
df.loc[df['target'] == 0, 'sepal length (cm)'].to_numpy()

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. ])

In [107]:
# One row label plus one column label returns a single scalar value
df.loc[0, 'sepal length (cm)']

5.1

In [125]:
# Label slices include both end points: rows 0 through 3 and every column
# from 'sepal length (cm)' through 'petal width (cm)'
df.loc[:3, 'sepal length (cm)': 'petal width (cm)']

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2


Other indexing conventions

In [108]:
# Slicing the frame directly with integers selects rows by position (rows 0-2)
df[0:3]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [115]:
# Keep only the rows whose sepal length exceeds 7.5 cm
long_sepal_mask = df['sepal length (cm)'] > 7.5
df[long_sepal_mask]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
105,7.6,3.0,6.6,2.1,2
117,7.7,3.8,6.7,2.2,2
118,7.7,2.6,6.9,2.3,2
122,7.7,2.8,6.7,2.0,2
131,7.9,3.8,6.4,2.0,2
135,7.7,3.0,6.1,2.3,2


## Let's use the describe() method to see some statistics on the DataFrame

In [202]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


#### We have 150 rows; let's see if we have any duplicates and get rid of them
We will not use the target column when comparing rows

In [203]:
# The first four columns are the flower measurements; 'target' is left out
# so duplicates are judged on the features alone
columns_for_criteria = df.columns[:4]

In [204]:
# Rows repeating an earlier row's measurements are dropped; the first
# occurrence is kept and the original index labels are preserved
df = df.drop_duplicates(subset=columns_for_criteria, keep='first')

We have one duplicate row, so 149 rows remain

In [205]:
# Number of rows remaining after the duplicates were dropped
len(df)

149

Using the column means, we'll add boolean columns to the DataFrame that flag whether each feature is above its mean

In [206]:
# Mean of every column, returned as a Series labelled by column name
means = df.mean(axis='index')

In [207]:
# For each of the four measurement columns, flag whether the row's value is
# above that column's mean. The frame and the means are both addressed by
# position through .iloc: a bare means[0] on a Series with string labels
# relies on a positional fallback that pandas has deprecated.
flag_columns = ['s_l_mean', 's_w_mean', 'p_l_mean', 'p_w_mean']
for position, flag_column in enumerate(flag_columns):
    # New columns are appended on the right, so positions 0-3 keep pointing
    # at the measurement columns while the loop runs.
    df[flag_column] = df.iloc[:, position] > means.iloc[position]

In [208]:
# Inspect the last rows to check the new boolean flag columns
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,s_l_mean,s_w_mean,p_l_mean,p_w_mean
145,6.7,3.0,5.2,2.3,2,True,False,True,True
146,6.3,2.5,5.0,1.9,2,True,False,True,True
147,6.5,3.0,5.2,2.0,2,True,False,True,True
148,6.2,3.4,5.4,2.3,2,True,True,True,True
149,5.9,3.0,5.1,1.8,2,True,False,True,True


In [209]:
# Map each short flag name to a descriptive column name
renamed_columns = {"s_l_mean": "above sepal length mean",
                   "s_w_mean": "above sepal width mean",
                   "p_l_mean": "above petal length mean",
                   "p_w_mean": "above petal width mean"}
# Rebind the result instead of using inplace=True: the assignment makes the
# state change explicit and inplace brings no benefit here.
df = df.rename(columns=renamed_columns)

In [210]:
# Confirm the flag columns now carry their descriptive names
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,above sepal length mean,above sepal width mean,above petal length mean,above petal width mean
0,5.1,3.5,1.4,0.2,0,False,True,False,False
1,4.9,3.0,1.4,0.2,0,False,False,False,False
2,4.7,3.2,1.3,0.2,0,False,True,False,False
3,4.6,3.1,1.5,0.2,0,False,True,False,False
4,5.0,3.6,1.4,0.2,0,False,True,False,False


Let's reorder the DataFrame and move the target to be the last column

In [211]:
# Take the current column order and move 'target' from its position to the end
reordered_columns = list(df.columns)
reordered_columns.remove('target')
reordered_columns.append('target')

In [212]:
# Apply the new column order
df = df.reindex(reordered_columns, axis='columns')
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),above sepal length mean,above sepal width mean,above petal length mean,above petal width mean,target
0,5.1,3.5,1.4,0.2,False,True,False,False,0
1,4.9,3.0,1.4,0.2,False,False,False,False,0
2,4.7,3.2,1.3,0.2,False,True,False,False,0
3,4.6,3.1,1.5,0.2,False,True,False,False,0
4,5.0,3.6,1.4,0.2,False,True,False,False,0


We will add a new column that indicates whether all features are above their mean

In [218]:
# A row qualifies only when every one of its four "above mean" flags is True
above_mean_columns = ['above sepal length mean', 'above sepal width mean',
                      'above petal length mean', 'above petal width mean']
criteria = df[above_mean_columns].all(axis=1)
df['all above mean'] = criteria

In [220]:
# Check the new 'all above mean' column on the last rows
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),above sepal length mean,above sepal width mean,above petal length mean,above petal width mean,target,all above mean
145,6.7,3.0,5.2,2.3,True,False,True,True,2,False
146,6.3,2.5,5.0,1.9,True,False,True,True,2,False
147,6.5,3.0,5.2,2.0,True,False,True,True,2,False
148,6.2,3.4,5.4,2.3,True,True,True,True,2,True
149,5.9,3.0,5.1,1.8,True,False,True,True,2,False
