# Pandas - Series & DataFrames

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import re
import math

In [3]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): with no category given this also hides deprecation and
# chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Series

In [4]:
# Create a Series from a NumPy array.
# No index is supplied, so pandas assigns the default 0..n-1 integer labels.
v = np.arange(1, 8)
s1 = pd.Series(data=v)
s1

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int32

In [43]:
# Number of non-null values in the Series (missing values are not counted)
s1.count()

7

In [44]:
# Total number of elements, missing values included (unlike count())
len(s1)

7

In [45]:
# Number of dimensions — a Series is one-dimensional
s1.ndim

1

In [46]:
# Shape as a one-element tuple: (number of elements,)
s1.shape

(7,)

In [47]:
# Bytes used by the underlying values (28 in the recorded run: 7 int32 values x 4 bytes)
s1.nbytes

28

In [48]:
# Element data type, inherited from the NumPy array (int32 in the recorded run;
# NOTE(review): NumPy's default integer width is platform-dependent)
s1.dtype

dtype('int32')

In [49]:
# Total number of elements in the Series
s1.size

7

In [6]:
# Build a Series from 10 uniform random floats in [0, 1), labelled with an
# explicit integer index 0..9. No seed is set, so the values differ on every run.
v2 = np.random.random(size=10)
ind2 = np.arange(10)
s = pd.Series(data=v2, index=ind2)
# Show the raw values, the index array and the resulting Series together
v2, ind2, s

(array([0.0686242 , 0.32712138, 0.25826914, 0.85504065, 0.32332033,
        0.66923512, 0.7307074 , 0.58234722, 0.9387143 , 0.14911888]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 0    0.068624
 1    0.327121
 2    0.258269
 3    0.855041
 4    0.323320
 5    0.669235
 6    0.730707
 7    0.582347
 8    0.938714
 9    0.149119
 dtype: float64)

# DataFrame

# Create DataFrame

In [7]:
# Start with an empty DataFrame: no rows, no columns
df = pd.DataFrame(data=None)
df

In [8]:
# Build a DataFrame from a flat Python list.
# Each element becomes one row of a single column, which gets the default label 0.
lang = ['Java', 'Python', 'C', 'C++']
df = pd.DataFrame(data=lang)
df

Unnamed: 0,0
0,Java
1,Python
2,C
3,C++


In [9]:
# Append a new column by assigning a list to a label that does not exist yet.
# The list length must match the number of rows in df.
rating = list(range(1, 5))
df[1] = rating
df

Unnamed: 0,0,1
0,Java,1
1,Python,2
2,C,3
3,C++,4


In [10]:
# Create DataFrames from a list of dictionaries (one dict per row).
# Dict keys become columns; a key missing from a row is filled with NaN.
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df2 = pd.DataFrame(data)
# `index` labels the rows; `columns` selects (and orders) which keys to keep.
df3 = pd.DataFrame(data, index=['row1', 'row2'], columns=['a', 'b'])
df4 = pd.DataFrame(data, index=['row1', 'row2'], columns=['a', 'b', 'c'])
# A requested column that no row provides ('d') is created and filled with NaN.
df5 = pd.DataFrame(data, index=['row1', 'row2'], columns=['a', 'b', 'c', 'd'])

# Create a DataFrame from a dictionary of lists (one list per column).
df0 = pd.DataFrame({'ID': [1, 2, 3, 4], 'Name': ['Asif', 'Basit', 'Ross', 'John']})

# Create a DataFrame from a dictionary of Series.
# The Series are aligned on their index labels; 'A' has no entry for 'd',
# so that cell becomes NaN.
# The mapping is deliberately NOT named `dict`: that would shadow the built-in
# type and break any later `dict(...)` call in the notebook.
df1_columns = {'A': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
               'B': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df1 = pd.DataFrame(df1_columns)

In [12]:
# df2: no index/columns given, so rows get default integer labels and
# 'c' is NaN for the row whose dict had no 'c' key
df2

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [13]:
# df3: custom row labels, restricted to columns 'a' and 'b' (key 'c' is left out)
df3


Unnamed: 0,a,b
row1,1,2
row2,5,10


In [14]:
# df4: custom row labels with all three keys; row1 has no 'c', so it shows NaN
df4

Unnamed: 0,a,b,c
row1,1,2,
row2,5,10,20.0


In [15]:
# df5: column 'd' appears in none of the input dicts, so it is entirely NaN
df5

Unnamed: 0,a,b,c,d
row1,1,2,,
row2,5,10,20.0,


In [16]:
# df0: built from a dict of lists — each key is a column, each list its values
df0

Unnamed: 0,ID,Name
0,1,Asif
1,2,Basit
2,3,Ross
3,4,John


In [17]:
# df1: built from Series of different lengths, aligned on index labels;
# 'A' has no 'd' entry, so that cell is NaN and the column is shown as float
df1

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [19]:
# DataFrame of random numbers with date indices — step 1: build the dates.
# An inclusive daily range given by its first and last day.
dates = pd.date_range('2020-01-20', '2020-01-26', freq='D')
dates

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# Seven consecutive days starting from the current moment.
# 'today' is resolved at run time and keeps the time of day, so the result
# changes on every execution.
dates = pd.date_range(start='today', periods=7, freq='D')
dates

DatetimeIndex(['2020-08-16 20:32:20.128443', '2020-08-17 20:32:20.128443',
               '2020-08-18 20:32:20.128443', '2020-08-19 20:32:20.128443',
               '2020-08-20 20:32:20.128443', '2020-08-21 20:32:20.128443',
               '2020-08-22 20:32:20.128443'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Same week as the start/end form, expressed as a start day plus a day count
dates = pd.date_range('2020-01-20', periods=7, freq='D')
dates

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# 7x7 matrix of uniform random floats in [0, 1); unseeded, so values vary per run
M = np.random.random(size=(7, 7))
M

array([[6.50031389e-01, 8.89054995e-01, 5.56096073e-01, 2.07243207e-01,
        3.30189475e-01, 6.86247134e-01, 1.11591031e-01],
       [5.82265594e-01, 5.58995568e-02, 1.79723761e-01, 9.19690937e-01,
        1.44054906e-01, 2.20840857e-01, 1.46511502e-01],
       [3.62067835e-01, 9.53368791e-01, 1.35085180e-01, 7.69970123e-01,
        7.47788751e-01, 6.53152572e-01, 9.29641508e-01],
       [9.39313215e-01, 1.57585155e-01, 2.97221116e-01, 6.35310435e-02,
        3.47926822e-01, 2.66806963e-01, 3.27204049e-01],
       [2.28748066e-01, 6.35699692e-01, 5.58684631e-01, 4.57999745e-01,
        6.72059525e-01, 1.36846387e-01, 8.16747634e-01],
       [2.23857173e-01, 8.36410469e-01, 1.81728898e-01, 7.90967008e-01,
        8.41048502e-01, 9.60541899e-01, 7.96481031e-01],
       [6.28087830e-04, 1.35314882e-01, 2.58400666e-01, 6.57684787e-01,
        6.62761526e-01, 6.09505138e-02, 2.23982254e-01]])

In [23]:
# Wrap the random matrix in a DataFrame, one row per date.
# Columns are not named here, so they get default integer labels.
dframe = pd.DataFrame(data=M, index=dates)
dframe

Unnamed: 0,0,1,2,3,4,5,6
2020-01-20,0.650031,0.889055,0.556096,0.207243,0.330189,0.686247,0.111591
2020-01-21,0.582266,0.0559,0.179724,0.919691,0.144055,0.220841,0.146512
2020-01-22,0.362068,0.953369,0.135085,0.76997,0.747789,0.653153,0.929642
2020-01-23,0.939313,0.157585,0.297221,0.063531,0.347927,0.266807,0.327204
2020-01-24,0.228748,0.6357,0.558685,0.458,0.67206,0.136846,0.816748
2020-01-25,0.223857,0.83641,0.181729,0.790967,0.841049,0.960542,0.796481
2020-01-26,0.000628,0.135315,0.258401,0.657685,0.662762,0.060951,0.223982


In [24]:
# Rename all columns at once by assigning a full list of new labels (C1..C7).
# The list length must equal the number of columns.
dframe.columns = [f'C{n}' for n in range(1, 8)]
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.650031,0.889055,0.556096,0.207243,0.330189,0.686247,0.111591
2020-01-21,0.582266,0.0559,0.179724,0.919691,0.144055,0.220841,0.146512
2020-01-22,0.362068,0.953369,0.135085,0.76997,0.747789,0.653153,0.929642
2020-01-23,0.939313,0.157585,0.297221,0.063531,0.347927,0.266807,0.327204
2020-01-24,0.228748,0.6357,0.558685,0.458,0.67206,0.136846,0.816748
2020-01-25,0.223857,0.83641,0.181729,0.790967,0.841049,0.960542,0.796481
2020-01-26,0.000628,0.135315,0.258401,0.657685,0.662762,0.060951,0.223982


In [25]:
# Row labels (index) of dframe — the DatetimeIndex passed as index= when it was built
dframe.index

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

In [26]:
# Column labels of dframe, returned as an Index object
dframe.columns

Index(['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7'], dtype='object')

In [27]:
# Rows ordered by column 'C1', smallest first.
# sort_values returns a new frame; dframe itself is left unchanged.
dframe.sort_values('C1', ascending=True)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-26,0.000628,0.135315,0.258401,0.657685,0.662762,0.060951,0.223982
2020-01-25,0.223857,0.83641,0.181729,0.790967,0.841049,0.960542,0.796481
2020-01-24,0.228748,0.6357,0.558685,0.458,0.67206,0.136846,0.816748
2020-01-22,0.362068,0.953369,0.135085,0.76997,0.747789,0.653153,0.929642
2020-01-21,0.582266,0.0559,0.179724,0.919691,0.144055,0.220841,0.146512
2020-01-20,0.650031,0.889055,0.556096,0.207243,0.330189,0.686247,0.111591
2020-01-23,0.939313,0.157585,0.297221,0.063531,0.347927,0.266807,0.327204


In [28]:
# Rows ordered by column 'C1', largest first.
# As above, the result is a new frame and dframe keeps its original order.
dframe.sort_values('C1', ascending=False)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-23,0.939313,0.157585,0.297221,0.063531,0.347927,0.266807,0.327204
2020-01-20,0.650031,0.889055,0.556096,0.207243,0.330189,0.686247,0.111591
2020-01-21,0.582266,0.0559,0.179724,0.919691,0.144055,0.220841,0.146512
2020-01-22,0.362068,0.953369,0.135085,0.76997,0.747789,0.653153,0.929642
2020-01-24,0.228748,0.6357,0.558685,0.458,0.67206,0.136846,0.816748
2020-01-25,0.223857,0.83641,0.181729,0.790967,0.841049,0.960542,0.796481
2020-01-26,0.000628,0.135315,0.258401,0.657685,0.662762,0.060951,0.223982


# Delete Column in DataFrame

In [29]:
# Show df1 as it stands before a column is deleted
df1

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [30]:
# Delete a column with the "del" statement; this removes 'B' from df1 in place,
# so re-running the cell fails because the column is already gone
del df1['B']
df1

Unnamed: 0,A
a,1.0
b,2.0
c,3.0
d,


In [31]:
# Build a fresh two-column frame from a dictionary of Series sharing one index.
# The mapping is deliberately NOT named `dict`: that would shadow the built-in
# type and break any later `dict(...)` call in the notebook.
df12_columns = {'A': pd.Series([1, 2, 3, 11], index=['a', 'b', 'c', 'd']),
                'B': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df12 = pd.DataFrame(df12_columns)
df12

Unnamed: 0,A,B
a,1,1
b,2,2
c,3,3
d,11,4


In [32]:
# Remove column 'A'. inplace=True mutates df12 itself instead of returning a
# modified copy, so no reassignment is needed.
df12.drop(columns=['A'], inplace=True)
df12

Unnamed: 0,B
a,1
b,2
c,3
d,4


# Delete Rows in DataFrame

In [33]:
# Sample frame for the row-deletion examples.
# C1: 30 evenly spaced floats from 10 to 100 inclusive (np.linspace).
col1 = np.linspace(start=10, stop=100, num=30)
# C2: 30 random integers in [10, 100); unseeded, so values vary per run.
col2 = np.random.randint(low=10, high=100, size=30)
df10 = pd.DataFrame(data={"C1": col1, "C2": col2})
df10

Unnamed: 0,C1,C2
0,10.0,93
1,13.103448,64
2,16.206897,86
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86


In [34]:
# Delete the rows whose index labels are 17, 18 and 19.
# drop() returns a new frame, so the result is assigned back to df10.
# errors='ignore' keeps the cell re-runnable: on a second run the labels are
# already gone and a plain drop() would raise KeyError.
df10 = df10.drop(index=[17, 18, 19], errors='ignore')
df10

Unnamed: 0,C1,C2
0,10.0,93
1,13.103448,64
2,16.206897,86
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86


In [35]:
# Delete the row with index label 16 without reassigning: inplace=True
# mutates df10 directly.
# errors='ignore' keeps the cell re-runnable: on a second run label 16 is
# already gone and a plain drop() would raise KeyError.
df10.drop(index=[16], inplace=True, errors='ignore')
df10

Unnamed: 0,C1,C2
0,10.0,93
1,13.103448,64
2,16.206897,86
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86


In [37]:
# Delete the first three rows by keeping everything from position 3 onward.
# iloc is positional, so this ignores the index labels. Each re-run of this
# cell removes three more rows.
df10 = df10.iloc[3:]
df10

Unnamed: 0,C1,C2
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86
10,41.034483,60
11,44.137931,73
12,47.241379,79


In [38]:
# Delete the last four rows: a negative stop keeps everything except the
# final four positions. Each re-run of this cell removes four more rows.
df10 = df10.iloc[:-4]
df10

Unnamed: 0,C1,C2
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86
10,41.034483,60
11,44.137931,73
12,47.241379,79


In [39]:
# Keep only the first 10 rows (by position) and discard the rest
df10 = df10.iloc[:10]
df10

Unnamed: 0,C1,C2
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
8,34.827586,61
9,37.931034,86
10,41.034483,60
11,44.137931,73
12,47.241379,79


In [40]:
# Index labels of the rows whose C2 value equals 43, as a plain Python list
df10.index[df10['C2'].eq(43)].tolist()

[6, 7]

In [42]:
# Delete rows based on a column value: look up the index labels of every row
# where C2 equals 61, then drop those labels from df10 in place.
# If no row matches, the label list is empty and nothing is removed.
df10.drop(
    index=df10.index[df10['C2'].eq(61)].tolist(),
    inplace=True,
)
df10

Unnamed: 0,C1,C2
3,19.310345,67
4,22.413793,20
5,25.517241,32
6,28.62069,43
7,31.724138,43
9,37.931034,86
10,41.034483,60
11,44.137931,73
12,47.241379,79
