In [74]:
import pandas as pd
import numpy as  np
from openpyxl import Workbook

Two types of data structures in pandas
> Series - a labeled sequence of values (1-dimensional)

> DataFrame - tabular data with labeled rows and columns (2-dimensional)

In [4]:
# Build a Series from a plain Python list; pandas assigns the
# default integer index 0..n-1.
marks = pd.Series(data=[50, 60, 70, 80])
marks

0    50
1    60
2    70
3    80
dtype: int64

In [5]:
# Look up one element by its index label; with the default integer
# index, label 1 is the second element.
marks.loc[1]

60

In [6]:
# Specifying a custom index: one label per value, in the same order.
marks = pd.Series(
    data=[50, 60, 70, 80],
    index=['ali', 'sara', 'ahmed', 'tahir'],
)
marks

ali      50
sara     60
ahmed    70
tahir    80
dtype: int64

In [7]:
# Look up one element by its custom index label.
marks.loc['ahmed']

70

In [8]:
# Force the element type: the integer inputs are stored as float64.
marks = pd.Series(
    data=[50, 60, 70, 80],
    index=['ali', 'sara', 'ahmed', 'tahir'],
    dtype='float64',
)
marks

ali      50.0
sara     60.0
ahmed    70.0
tahir    80.0
dtype: float64

In [9]:
# Boolean-mask filtering: keep only the entries strictly above 60.
marks.loc[marks.gt(60)]

ahmed    70.0
tahir    80.0
dtype: float64

In [10]:
# Fancy indexing: pass a list of labels to select several entries at once.
marks.loc[['sara', 'tahir']]

sara     60.0
tahir    80.0
dtype: float64

In [11]:
# Changing a value in a series: assign through the entry's label.
marks.loc['sara'] = 65
marks

ali      50.0
sara     65.0
ahmed    70.0
tahir    80.0
dtype: float64

In [12]:
# Build a Series from a dictionary: keys become the index labels.
# The None entry is stored as NaN, so the Series dtype is float64.
marks1 = pd.Series({
    'ali': 50,
    'sara': 65,
    'ahmed': 70,
    'tahir': None,
})
marks1

ali      50.0
sara     65.0
ahmed    70.0
tahir     NaN
dtype: float64

In [13]:
# Element-wise missing-value mask: True where the entry is NaN.
marks1.isna()

ali      False
sara     False
ahmed    False
tahir     True
dtype: bool

In [14]:
# Inverse mask: True where the entry holds a real (non-NaN) value.
marks1.notna()

ali       True
sara      True
ahmed     True
tahir    False
dtype: bool

In [15]:
# Arithmetic mean of all values in the series.
marks.mean()

66.25

In [16]:
# Show both series one after the other so they can be compared
# before being added together.
display(marks, marks1)

ali      50.0
sara     65.0
ahmed    70.0
tahir    80.0
dtype: float64

ali      50.0
sara     65.0
ahmed    70.0
tahir     NaN
dtype: float64

In [17]:
# Adding two series: values are matched by index label, and any label
# whose value is NaN in either operand yields NaN in the result.
marks.add(marks1)

ali      100.0
sara     130.0
ahmed    140.0
tahir      NaN
dtype: float64

### DataFrames: Tabular Data
- Create a DataFrame with

`pd.DataFrame`

In [18]:
# Build a DataFrame from a dict of equal-length lists:
# each key becomes a column, each list position becomes a row.
student = pd.DataFrame(
    data={
        'name': ['Ali', 'Sara', 'Ahmed', 'Tahir'],
        'mark': [50, 65, 70, 80],
        'cpga': [2.4, 2.6, 3, 3.3],
        'address': ['Karachi', 'Lahore', 'Islamabad', 'Quetta'],
    }
)

student

Unnamed: 0,name,mark,cpga,address
0,Ali,50,2.4,Karachi
1,Sara,65,2.6,Lahore
2,Ahmed,70,3.0,Islamabad
3,Tahir,80,3.3,Quetta


In [19]:
# Displaying the column labels of the DataFrame (returned as an Index object)
student.columns

Index(['name', 'mark', 'cpga', 'address'], dtype='object')

In [20]:
# Select a single column; the result is a Series named after the column.
student.loc[:, 'name']

0      Ali
1     Sara
2    Ahmed
3    Tahir
Name: name, dtype: object

In [21]:
# Fancy indexing: a list of column labels returns a DataFrame
# restricted to those columns.
student.loc[:, ['name', 'mark']]

Unnamed: 0,name,mark
0,Ali,50
1,Sara,65
2,Ahmed,70
3,Tahir,80


In [22]:
# Filtering: keep only the rows whose mark is strictly above 50.
student.loc[student['mark'].gt(50)]

Unnamed: 0,name,mark,cpga,address
1,Sara,65,2.6,Lahore
2,Ahmed,70,3.0,Islamabad
3,Tahir,80,3.3,Quetta


In [23]:
# Select the 'mark' column. Bracket access is used instead of attribute
# access because it also works for names that are not valid identifiers
# or that clash with DataFrame methods.
student['mark']

0    50
1    65
2    70
3    80
Name: mark, dtype: int64

In [24]:
import numpy as np

# Fill a new column from a NumPy range: 1, 2, 3, 4 (one value per row).
student['semester'] = np.arange(start=1, stop=5)

# Show the new column next to the student names.
student.loc[:, ['semester', 'name']]

Unnamed: 0,semester,name
0,1,Ali
1,2,Sara
2,3,Ahmed
3,4,Tahir


In [25]:
# A DataFrame with explicit row labels ('d1', 'd2') instead of the
# default integer index.
doctors = pd.DataFrame(
    data={
        'names': ['Asad', 'Rahim'],
        'qualification': ['PHD', 'MS'],
    },
    index=['d1', 'd2'],
)

doctors

Unnamed: 0,names,qualification
d1,Asad,PHD
d2,Rahim,MS


In [26]:
# Adding a new column: a scalar is broadcast to every row.
doctors.loc[:, 'address'] = 'Karachi'

In [27]:
# Show the frame again to confirm the new 'address' column.
doctors

Unnamed: 0,names,qualification,address
d1,Asad,PHD,Karachi
d2,Rahim,MS,Karachi


In [28]:
# Accessing a row by its index label with .loc; the row comes back as a
# Series whose index is the column names.
doctors.loc['d1']

names               Asad
qualification        PHD
address          Karachi
Name: d1, dtype: object

In [29]:
# Accessing a row by its integer position with .iloc (0 = first row);
# here that is the same row as the label 'd1'.
doctors.iloc[0]

names               Asad
qualification        PHD
address          Karachi
Name: d1, dtype: object

In [30]:
# Accessing a row by its integer position with .iloc (0 = first row).
# NOTE(review): this cell repeats the previous one verbatim.
doctors.iloc[0]

names               Asad
qualification        PHD
address          Karachi
Name: d1, dtype: object

In [31]:
import pandas as pd
import numpy as np

In [32]:
# Cricket team standings: one column per key, one row per team.
data = dict(
    teams=['Pakistan', 'India', 'Sri Lanka', 'S. Africa'],
    played=[5, 6, 4, 5],
    points=[10, 8, 8, 6],
)

# Label the rows t1..t4 instead of using the default integer index.
row_labels = ['t1', 't2', 't3', 't4']
stats = pd.DataFrame(data=data, index=row_labels)

display(stats)


Unnamed: 0,teams,played,points
t1,Pakistan,5,10
t2,India,6,8
t3,Sri Lanka,4,8
t4,S. Africa,5,6


In [33]:
# Display the first two rows (head(n) returns the first n rows)
stats.head(2)

Unnamed: 0,teams,played,points
t1,Pakistan,5,10
t2,India,6,8


In [34]:
# Display the last two rows (tail(n) returns the last n rows)
stats.tail(2)

Unnamed: 0,teams,played,points
t3,Sri Lanka,4,8
t4,S. Africa,5,6


In [35]:
# Accessing a specific column by its label; the result is a Series that
# keeps the row labels t1..t4.
stats['teams']

t1     Pakistan
t2        India
t3    Sri Lanka
t4    S. Africa
Name: teams, dtype: object

In [36]:
# Accessing a specific row by integer position (0 = first row)
stats.iloc[0]

teams     Pakistan
played           5
points          10
Name: t1, dtype: object

In [37]:
# Accessing a specific row by its index label
stats.loc['t1']

teams     Pakistan
played           5
points          10
Name: t1, dtype: object

In [38]:
# Getting the underlying data as a NumPy array. Because the columns mix
# strings and integers, the array has dtype=object.
stats.to_numpy()

array([['Pakistan', 5, 10],
       ['India', 6, 8],
       ['Sri Lanka', 4, 8],
       ['S. Africa', 5, 6]], dtype=object)

In [39]:
# Add a new column from a NumPy array, then remove it again.
arr = np.arange(1, 5)  # one value per row: [1 2 3 4]
print(arr)

stats['No'] = arr  # column added, passing the array as its values
display(stats)

# drop() returns a new frame without the column. Rebinding `stats` to that
# result (rather than using inplace=True) keeps the step explicit and
# follows the pandas recommendation to avoid in-place mutation.
stats = stats.drop(columns='No')
display(stats)


[1 2 3 4]


Unnamed: 0,teams,played,points,No
t1,Pakistan,5,10,1
t2,India,6,8,2
t3,Sri Lanka,4,8,3
t4,S. Africa,5,6,4


Unnamed: 0,teams,played,points
t1,Pakistan,5,10
t2,India,6,8
t3,Sri Lanka,4,8
t4,S. Africa,5,6


In [40]:
# Re-create the 'No' column (values 1..4) directly from a NumPy range.
stats['No'] = np.arange(1, 5)
display(stats)

Unnamed: 0,teams,played,points,No
t1,Pakistan,5,10,1
t2,India,6,8,2
t3,Sri Lanka,4,8,3
t4,S. Africa,5,6,4


In [41]:
# Append a row at the end by assigning to an index label that does not
# exist yet; the list supplies one value per column, in column order.
stats.loc['t5'] = ['England', 5, 7, 5]
display(stats)

# Remove the row again. drop() returns a new frame, so rebind `stats`
# instead of mutating it with inplace=True.
stats = stats.drop(index='t5')
display(stats)

Unnamed: 0,teams,played,points,No
t1,Pakistan,5,10,1
t2,India,6,8,2
t3,Sri Lanka,4,8,3
t4,S. Africa,5,6,4
t5,England,5,7,5


Unnamed: 0,teams,played,points,No
t1,Pakistan,5,10,1
t2,India,6,8,2
t3,Sri Lanka,4,8,3
t4,S. Africa,5,6,4


In [46]:
# Remove the helper 'No' column. errors='ignore' makes the cell safe to
# re-run: without it, a second run raises KeyError because the column is
# already gone.
stats = stats.drop(columns='No', errors='ignore')
display(stats)

Unnamed: 0,teams,played,points
t1,Pakistan,5,10
t2,India,6,8
t3,Sri Lanka,4,8
t4,S. Africa,5,6


In [48]:
# Add a column of short team codes, one per row.
# NOTE(review): 'Teams' differs from the existing 'teams' column only by
# case, which is easy to confuse; a name like 'code' may be clearer.
stats['Teams'] = ['PK', 'IN', 'SL', 'SA']
stats

Unnamed: 0,teams,played,points,Teams
t1,Pakistan,5,10,PK
t2,India,6,8,IN
t3,Sri Lanka,4,8,SL
t4,S. Africa,5,6,SA


In [49]:
# Derive a 'win' column as points / 2 (true division, so the result is float).
# NOTE(review): presumably 2 points are awarded per win; confirm.
stats['win'] = stats['points'] / 2
stats

Unnamed: 0,teams,played,points,Teams,win
t1,Pakistan,5,10,PK,5.0
t2,India,6,8,IN,4.0
t3,Sri Lanka,4,8,SL,4.0
t4,S. Africa,5,6,SA,3.0


In [50]:
# Change the datatype of 'win' from float to integer; astype returns a new
# Series, so it is assigned back to the column.
stats['win'] = stats['win'].astype(int)
display(stats)

Unnamed: 0,teams,played,points,Teams,win
t1,Pakistan,5,10,PK,5
t2,India,6,8,IN,4
t3,Sri Lanka,4,8,SL,4
t4,S. Africa,5,6,SA,3


In [51]:
# Matches that were not won: games played minus wins, computed row by row.
stats['loss/tie'] = stats['played'].sub(stats['win'])
display(stats)

Unnamed: 0,teams,played,points,Teams,win,loss/tie
t1,Pakistan,5,10,PK,5,0
t2,India,6,8,IN,4,2
t3,Sri Lanka,4,8,SL,4,0
t4,S. Africa,5,6,SA,3,2


In [53]:
# Get the team with maximum points: idxmax() returns the index label of the
# largest value, which .loc then uses to fetch the whole row.
ind = stats['points'].idxmax()
display(stats.loc[ind])

teams       Pakistan
played             5
points            10
Teams             PK
win                5
loss/tie           0
Name: t1, dtype: object

In [54]:
# NumPy functions apply element-wise to a Series and keep its index labels.
np.sqrt(stats['played'])

t1    2.236068
t2    2.449490
t3    2.000000
t4    2.236068
Name: played, dtype: float64

In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the string columns are left out.
stats.describe()

Unnamed: 0,played,points,win,loss/tie
count,4.0,4.0,4.0,4.0
mean,5.0,8.0,4.0,1.0
std,0.816497,1.632993,0.816497,1.154701
min,4.0,6.0,3.0,0.0
25%,4.75,7.5,3.75,0.0
50%,5.0,8.0,4.0,1.0
75%,5.25,8.5,4.25,2.0
max,6.0,10.0,5.0,2.0


In [56]:
display(stats)
# Column-wise sum. Numeric columns are added up, but the string columns
# ('teams', 'Teams') are concatenated, so the result has dtype object.
stats.sum()

Unnamed: 0,teams,played,points,Teams,win,loss/tie
t1,Pakistan,5,10,PK,5,0
t2,India,6,8,IN,4,2
t3,Sri Lanka,4,8,SL,4,0
t4,S. Africa,5,6,SA,3,2


teams       PakistanIndiaSri LankaS. Africa
played                                   20
points                                   32
Teams                              PKINSLSA
win                                      16
loss/tie                                  4
dtype: object

In [62]:
# Save dataframe to file in CSV format (relative path, so the file lands in
# the current working directory); the row labels are written as the first column.
stats.to_csv('data.csv')

In [65]:
# Read the CSV back; index_col=0 turns the first column (the t1..t4 labels)
# into the index again instead of a regular data column.
my_data = pd.read_csv('data.csv', index_col=0)
display(my_data)

Unnamed: 0,teams,played,points,Teams,win,loss/tie
t1,Pakistan,5,10,PK,5,0
t2,India,6,8,IN,4,2
t3,Sri Lanka,4,8,SL,4,0
t4,S. Africa,5,6,SA,3,2


In [66]:
# Label-based row lookup works on the reloaded frame because the index was restored.
my_data.loc['t4']

teams       S. Africa
played              5
points              6
Teams              SA
win                 3
loss/tie            2
Name: t4, dtype: object

In [76]:
# Inspect the current column labels before renaming them.
my_data.columns

Index(['teams', 'played', 'points', 'Teams', 'win', 'loss/tie'], dtype='object')

In [79]:
# Rename every column by assigning a new list of labels. The assignment is
# positional, so the list must contain exactly one name per existing column.
my_data.columns = ['Teams', 'Played', 'Points', 'Tms', 'Win', 'Loss/Tie']
display(my_data)

Unnamed: 0,Teams,Played,Points,Tms,Win,Loss/Tie
t1,Pakistan,5,10,PK,5,0
t2,India,6,8,IN,4,2
t3,Sri Lanka,4,8,SL,4,0
t4,S. Africa,5,6,SA,3,2


In [81]:
# Load the student results from students.csv (relative path: the file must
# be in the current working directory) and show the frame.
df = pd.read_csv('students.csv')
display(df)

Unnamed: 0,Student Code,Degree,Student Name,Mid,Quiz 1,Quiz 2,Best of Quizzes,Assignment 1,Assignment 2,Best of Assignments,Total Sessional (50),Final (50),Total (100),Grade
0,022-14-19987,BS(CS),Abdul Basit,28,8.0,3.0,8,7.0,9.0,9,45,25.0,70,B
1,022-14-110233,BS(CS),Adeel Ahmed,17,,5.0,5,8.0,10.0,10,32,18.0,50,F
2,022-14-110585,BS(CS),Afrah Zareen,18,5.0,2.0,5,8.0,10.0,10,33,30.0,63,C
3,022-14-19718,BS(CS),Ahmed Ali Raza,14,7.0,2.0,7,,2.0,2,23,23.0,46,F
4,022-14-110648,BS(CS),Ahsan Ali Vohra,27,7.0,6.0,7,7.0,9.0,9,43,34.0,77,B
5,022-14-110232,BS(CS),Ameer Hamza,25,9.0,6.0,9,8.0,10.0,10,44,27.0,71,B
6,022-14-110588,BS(CS),Anas Ali Khan,28,5.0,6.0,6,8.0,10.0,10,44,30.0,74,B
7,022-14-110388,BS(CS),Aneebullah Niazi,26,9.0,6.0,9,8.0,10.0,10,45,40.0,85,A
8,022-14-110601,BS(CS),Areesha Sohail,19,9.0,4.0,9,7.0,9.0,9,37,24.0,61,C
9,022-14-110599,BS(CS),Arsalan,28,8.0,6.0,8,8.0,,8,44,40.0,84,A


In [83]:
# Summary statistics for the numeric columns. A count below the number of
# rows means that column has missing values.
df.describe()

Unnamed: 0,Mid,Quiz 1,Quiz 2,Best of Quizzes,Assignment 1,Assignment 2,Best of Assignments,Total Sessional (50),Final (50),Total (100)
count,48.0,45.0,45.0,48.0,45.0,36.0,48.0,48.0,47.0,48.0
mean,22.9375,7.288889,4.866667,7.020833,7.755556,9.444444,8.9375,38.895833,27.851064,66.166667
std,5.236558,1.561209,1.455397,1.973113,1.170772,1.6291,2.127892,7.179283,8.431253,15.227821
min,9.0,4.0,1.0,0.0,5.0,2.0,0.0,18.0,8.0,18.0
25%,20.0,6.0,4.0,6.0,7.0,9.0,8.75,36.75,22.5,60.0
50%,25.0,8.0,5.0,7.0,8.0,10.0,10.0,41.0,28.0,69.0
75%,27.0,9.0,6.0,9.0,8.0,10.0,10.0,44.0,33.5,75.25
max,30.0,9.0,7.0,9.0,10.0,11.0,11.0,46.0,45.0,91.0


In [84]:
# Number of missing values per column: isna() gives a boolean mask and
# sum() counts the True entries.
df.isna().sum()

Student Code             0
Degree                   0
Student Name             0
Mid                      0
Quiz 1                   3
Quiz 2                   3
Best of Quizzes          0
Assignment 1             3
Assignment 2            12
Best of Assignments      0
Total Sessional (50)     0
Final (50)               1
Total (100)              0
Grade                    0
dtype: int64

In [85]:
# Number of non-missing values per column: notna() is the direct
# complement of isna(), so no manual negation is needed.
df.notna().sum()

Student Code            48
Degree                  48
Student Name            48
Mid                     48
Quiz 1                  45
Quiz 2                  45
Best of Quizzes         48
Assignment 1            45
Assignment 2            36
Best of Assignments     48
Total Sessional (50)    48
Final (50)              47
Total (100)             48
Grade                   48
dtype: int64

In [90]:
# Select a column whose label contains a space; bracket access is required here.
df['Student Name']

0                        Abdul Basit
1                        Adeel Ahmed
2                       Afrah Zareen
3                     Ahmed Ali Raza
4                    Ahsan Ali Vohra
5                        Ameer Hamza
6                      Anas Ali Khan
7                   Aneebullah Niazi
8                     Areesha Sohail
9                            Arsalan
10               Fatima Haider Warsi
11                       Habib Ullah
12               Hafiza Tooba Akbani
13                Hamza Abdul Jabbar
14                     Hareem Afshan
15                      Haseeb Sajid
16                      Hassam Ahmed
17                      Khalid Anwer
18                     Madiha Jabeen
19                   Mohammad Hunain
20                    Muhammad Aamir
21    Muhammad Abdul Rehman Siddiqui
22                 Muhammad Abdullah
23                Muhammad Ali Iqbal
24                    Muhammad Bilal
25                    Muhammad Faraz
26           Muhammad Ghazali Faridi
2

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup