In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Series - 1D Array

In [7]:
# A Series is a one-dimensional labelled array that can hold values of any type.
# Every element is addressed through the index, which is integer-based by
# default; any other labels can be supplied through `index=`.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# The elements themselves may be arbitrary objects - here three separate lists.
t = pd.Series([[1, 3, 5, np.nan, 6, 8] for _ in range(3)])
# Mixed value types combined with custom string labels.
u = pd.Series(
    [8, 2, 3, "yo", "hi", 8, 4.0],
    index=["a", "b", "c", "d", "hello", "f", "z"],
)
for series in (s, t, u):
    print(series)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
0    [1, 3, 5, nan, 6, 8]
1    [1, 3, 5, nan, 6, 8]
2    [1, 3, 5, nan, 6, 8]
dtype: object
a         8
b         2
c         3
d        yo
hello    hi
f         8
z         4
dtype: object


In [8]:
# Look up the element labelled 2; s uses the default integer index, so this is its third value.
s[2]

5.0

In [9]:
# Every element of t is itself a Python list, so this returns the whole first list.
t[0]

[1, 3, 5, nan, 6, 8]

In [5]:
# u carries string labels, so plain [] is a label lookup (u["hello"]).
# Fetching the sixth element by position must go through .iloc: the bare
# u[5] form depended on a deprecated integer-position fallback that newer
# pandas versions treat as a lookup of the (non-existent) label 5.
# The printed caption is left unchanged so the cell output stays the same.
print("u['hello']:", u["hello"], ' u[5]:', u.iloc[5])

u['hello']: hi  u[5]: 8


# DataFrame - 2D Array

In [26]:
# A DataFrame is a two-dimensional labelled data structure. Row and column
# labels are integers by default; pass `index=` to set the row labels and
# `columns=` to set the column labels. Labels of different types can be mixed,
# as the integer 2 among the rows and the integer 1 among the columns show.
row_labels = ['a', 'b', 'cc', 'd', 2, 'e']
column_labels = ['a', 'b', 1, 'd']
df = pd.DataFrame(np.random.randn(6, 4), index=row_labels, columns=column_labels)
print(df)

           a         b         1         d
a   0.685167 -0.594003  0.526106 -0.334567
b  -0.640999 -0.765958  1.051069  0.557890
cc  0.695897  0.374637  0.170478 -0.549513
d  -0.093437  1.437555  0.078734  0.187099
2   1.745763 -0.811318  0.006612 -0.513819
e   1.357226 -0.856825  0.536754  0.476268


In [34]:
df["a"]  # a column label inside [] selects that column; the result is a Series keyed by the row labels

a     0.685167
b    -0.640999
cc    0.695897
d    -0.093437
2     1.745763
e     1.357226
Name: a, dtype: float64

In [35]:
# Pick a single element by row label "cc" and column label "a" in one lookup.
# .loc[row, column] replaces the chained df["a"]["cc"] form, which first builds
# an intermediate Series and is the pattern that misbehaves when used to assign.
df.loc["cc", "a"]

0.6958971095331139

In [38]:
# A DataFrame can also be built from a dictionary: every key becomes a column
# name and the list stored under it supplies that column's values.
column_data = {
    'a': [1, 'hi', 3, 4, 5],      # deliberately mixes integers and a string
    'b': [6, 7, 8, 9, 10],
    'c': [11, 12, 13, 14, 15],
    'd': [16, 17, 18, 19, 20],
}
df = pd.DataFrame(column_data)
print(df)

    a   b   c   d
0   1   6  11  16
1  hi   7  12  17
2   3   8  13  18
3   4   9  14  19
4   5  10  15  20


In [40]:
# Show the dtype pandas chose for each column.
print(df.dtypes)

a    object
b     int64
c     int64
d     int64
dtype: object


In [102]:
df = pd.DataFrame(np.random.rand(30, 4), columns=['a', 'b', 'c', 'd'])
# NaN marks a missing value. Set all three cells with a single .loc assignment:
# the chained form df['c'][0] = ... writes through an intermediate object, which
# triggers SettingWithCopyWarning and leaves df untouched under copy-on-write.
# np.nan is used because the np.NAN alias no longer exists in NumPy 2.
df.loc[[0, 2, 29], 'c'] = np.nan
print(df.head())
print(df.tail())

          a         b         c         d
0  0.504560  0.820037       NaN  0.858389
1  0.846102  0.344087  0.572294  0.715991
2  0.373753  0.352058       NaN  0.533771
3  0.696601  0.861036  0.646316  0.463349
4  0.677217  0.083729  0.374331  0.899031
           a         b         c         d
25  0.153476  0.302053  0.996457  0.417623
26  0.868749  0.860391  0.630968  0.063175
27  0.501637  0.782758  0.425776  0.099909
28  0.732402  0.058306  0.870334  0.294619
29  0.154485  0.676002       NaN  0.629088


In [103]:
# Print the three building blocks of the frame in turn:
# the row labels, the column labels and the underlying array of values.
for part in (df.index, df.columns, df.values):
    print(part)

RangeIndex(start=0, stop=30, step=1)
Index(['a', 'b', 'c', 'd'], dtype='object')
[[0.50455989 0.82003707        nan 0.85838877]
 [0.84610179 0.34408668 0.57229444 0.71599116]
 [0.37375292 0.35205804        nan 0.53377143]
 [0.69660115 0.86103623 0.64631565 0.46334907]
 [0.6772167  0.08372917 0.37433102 0.89903099]
 [0.41901738 0.20325093 0.88599504 0.56537203]
 [0.85514127 0.20336399 0.74948    0.19551213]
 [0.66367961 0.55547861 0.85395004 0.29129772]
 [0.56925973 0.70121855 0.03779832 0.0444342 ]
 [0.51783089 0.24160602 0.18888318 0.64344871]
 [0.97560023 0.67967422 0.97851643 0.78845486]
 [0.98543014 0.4398332  0.77721714 0.67177731]
 [0.23421712 0.23368179 0.21267913 0.3110895 ]
 [0.44923135 0.63661673 0.90480214 0.90677801]
 [0.28516263 0.90993745 0.90227913 0.85607377]
 [0.53507196 0.99254798 0.3252659  0.8959904 ]
 [0.3532468  0.4257623  0.00527295 0.52157429]
 [0.70966703 0.60725335 0.54952481 0.1628034 ]
 [0.47299205 0.72324079 0.01476012 0.66578827]
 [0.68601307 0.72089571 0.

In [104]:
# Summary statistics for each column.
print(df.describe())

               a          b          c          d
count  30.000000  30.000000  27.000000  30.000000
mean    0.547589   0.542287   0.598199   0.499508
std     0.241659   0.265508   0.312520   0.276553
min     0.135295   0.058306   0.005273   0.044434
25%     0.377841   0.312562   0.384185   0.292128
50%     0.526451   0.604072   0.630968   0.527673
75%     0.706401   0.722655   0.878164   0.708163
max     0.985430   0.992548   0.996457   0.906778


In [105]:
# sort_index orders the frame by its labels, not by its values.
# A DataFrame has two axes: axis=1 sorts the column labels, axis=0 sorts the
# row labels. ascending=False requests descending order in both cases.
for axis in (1, 0):
    print(df.sort_index(axis=axis, ascending=False))

           d         c         b         a
0   0.858389       NaN  0.820037  0.504560
1   0.715991  0.572294  0.344087  0.846102
2   0.533771       NaN  0.352058  0.373753
3   0.463349  0.646316  0.861036  0.696601
4   0.899031  0.374331  0.083729  0.677217
5   0.565372  0.885995  0.203251  0.419017
6   0.195512  0.749480  0.203364  0.855141
7   0.291298  0.853950  0.555479  0.663680
8   0.044434  0.037798  0.701219  0.569260
9   0.643449  0.188883  0.241606  0.517831
10  0.788455  0.978516  0.679674  0.975600
11  0.671777  0.777217  0.439833  0.985430
12  0.311089  0.212679  0.233682  0.234217
13  0.906778  0.904802  0.636617  0.449231
14  0.856074  0.902279  0.909937  0.285163
15  0.895990  0.325266  0.992548  0.535072
16  0.521574  0.005273  0.425762  0.353247
17  0.162803  0.549525  0.607253  0.709667
18  0.665788  0.014760  0.723241  0.472992
19  0.432733  0.865107  0.720896  0.686013
20  0.684677  0.590930  0.204465  0.390105
21  0.114156  0.394038  0.600890  0.716784
22  0.72665

In [106]:
# Order the rows by the values held in column 'b', largest first.
df2 = df.sort_values('b', ascending=False)

In [107]:
# Take the first four rows of the sorted frame once, then show the rows,
# their (original) row labels and their raw values, separated by blank lines.
top_four = df2.iloc[0:4]
print(top_four)
print("\n")
print(top_four.index)
print("\n")
print(top_four.values)

           a         b         c         d
15  0.535072  0.992548  0.325266  0.895990
14  0.285163  0.909937  0.902279  0.856074
3   0.696601  0.861036  0.646316  0.463349
26  0.868749  0.860391  0.630968  0.063175


Int64Index([15, 14, 3, 26], dtype='int64')


[[0.53507196 0.99254798 0.3252659  0.8959904 ]
 [0.28516263 0.90993745 0.90227913 0.85607377]
 [0.69660115 0.86103623 0.64631565 0.46334907]
 [0.86874871 0.86039067 0.63096765 0.06317519]]


# Selecting rows by key or index

In [108]:
# The .loc indexer selects rows by their labels (keys).
# Given a list of row labels it returns exactly those rows, in the order
# listed; leaving out the column selector keeps every column.
df2.loc[[22, 3]]

Unnamed: 0,a,b,c,d
22,0.275454,0.579854,0.89129,0.726657
3,0.696601,0.861036,0.646316,0.463349


In [109]:
# The .iloc indexer selects rows by their position in the frame, regardless of
# their labels. It accepts a slice or a list of positions; here the first four
# rows are taken with all of their columns.
df2.iloc[:4]

Unnamed: 0,a,b,c,d
15,0.535072,0.992548,0.325266,0.89599
14,0.285163,0.909937,0.902279,0.856074
3,0.696601,0.861036,0.646316,0.463349
26,0.868749,0.860391,0.630968,0.063175


In [110]:
# Boolean mask: keep only the rows whose value in column 'c' is greater than 0.5.
df[df['c'] > 0.5]

Unnamed: 0,a,b,c,d
1,0.846102,0.344087,0.572294,0.715991
3,0.696601,0.861036,0.646316,0.463349
5,0.419017,0.203251,0.885995,0.565372
6,0.855141,0.203364,0.74948,0.195512
7,0.66368,0.555479,0.85395,0.291298
10,0.9756,0.679674,0.978516,0.788455
11,0.98543,0.439833,0.777217,0.671777
13,0.449231,0.636617,0.904802,0.906778
14,0.285163,0.909937,0.902279,0.856074
17,0.709667,0.607253,0.549525,0.162803


In [111]:
# Drop every row that has at least one missing value, then preview the result.
(
    df
    .dropna(how='any')
    .head()
)

Unnamed: 0,a,b,c,d
1,0.846102,0.344087,0.572294,0.715991
3,0.696601,0.861036,0.646316,0.463349
4,0.677217,0.083729,0.374331,0.899031
5,0.419017,0.203251,0.885995,0.565372
6,0.855141,0.203364,0.74948,0.195512


In [112]:
# Replace each missing value with 0 and preview the first rows.
df.fillna(0).head()

Unnamed: 0,a,b,c,d
0,0.50456,0.820037,0.0,0.858389
1,0.846102,0.344087,0.572294,0.715991
2,0.373753,0.352058,0.0,0.533771
3,0.696601,0.861036,0.646316,0.463349
4,0.677217,0.083729,0.374331,0.899031


In [114]:
# Load the iris data set into a pandas DataFrame directly from its URL.
# This plain call is the wrong way to read this file: read_csv treats the
# first row as the column names, as the header of the printed frame shows.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url)
print(df)

     5.1  3.5  1.4  0.2     Iris-setosa
0    4.9  3.0  1.4  0.2     Iris-setosa
1    4.7  3.2  1.3  0.2     Iris-setosa
2    4.6  3.1  1.5  0.2     Iris-setosa
3    5.0  3.6  1.4  0.2     Iris-setosa
4    5.4  3.9  1.7  0.4     Iris-setosa
5    4.6  3.4  1.4  0.3     Iris-setosa
6    5.0  3.4  1.5  0.2     Iris-setosa
7    4.4  2.9  1.4  0.2     Iris-setosa
8    4.9  3.1  1.5  0.1     Iris-setosa
9    5.4  3.7  1.5  0.2     Iris-setosa
10   4.8  3.4  1.6  0.2     Iris-setosa
11   4.8  3.0  1.4  0.1     Iris-setosa
12   4.3  3.0  1.1  0.1     Iris-setosa
13   5.8  4.0  1.2  0.2     Iris-setosa
14   5.7  4.4  1.5  0.4     Iris-setosa
15   5.4  3.9  1.3  0.4     Iris-setosa
16   5.1  3.5  1.4  0.3     Iris-setosa
17   5.7  3.8  1.7  0.3     Iris-setosa
18   5.1  3.8  1.5  0.3     Iris-setosa
19   5.4  3.4  1.7  0.2     Iris-setosa
20   5.1  3.7  1.5  0.4     Iris-setosa
21   4.6  3.6  1.0  0.2     Iris-setosa
22   5.1  3.3  1.7  0.5     Iris-setosa
23   4.8  3.4  1.9  0.2     Iris-setosa


In [116]:
# Supplying the column names explicitly makes read_csv keep the first row as data.
iris_columns = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'target',
]
df1 = pd.read_csv(url, names=iris_columns)
print(df1)

     sepal length  sepal width  petal length  petal width          target
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
5             5.4          3.9           1.7          0.4     Iris-setosa
6             4.6          3.4           1.4          0.3     Iris-setosa
7             5.0          3.4           1.5          0.2     Iris-setosa
8             4.4          2.9           1.4          0.2     Iris-setosa
9             4.9          3.1           1.5          0.1     Iris-setosa
10            5.4          3.7           1.5          0.2     Iris-setosa
11            4.8          3.4           1.6          0.2     Iris-setosa
12            4.8          3.0        