# Pandas

## Pandas Series
A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.

In [4]:
import numpy as np
import pandas as pd

## Series from List

In [5]:
# A plain Python list is all that is needed to build a Series; since no index
# is supplied, the entries get default integer labels starting at 0.
country = [
    'India',
    'Pakistan',
    'Bangladesh',
    'Sri Lanka',
    'Nepal',
    'Bhutan',
    'Maldives',
]
pd.Series(data=country)

0         India
1      Pakistan
2    Bangladesh
3     Sri Lanka
4         Nepal
5        Bhutan
6      Maldives
dtype: object

In [6]:
# Keep the scores both as the raw list (`runs`) and as a Series (`runs_ser`).
runs = [51, 45, 67, 12, 34, 0, 98]
runs_ser = pd.Series(data=runs)

In [7]:
# Passing `index=` swaps the default integer labels for custom ones;
# the two lists are paired up by position, so each mark is labelled with its subject.
subjects = ['Maths', 'Statistics', 'Social Science', 'Computer', 'Physics', 'Chemistry', 'Biology']
marks = [45, 67, 89, 90, 34, 56, 78]
pd.Series(data=marks, index=subjects)

Maths             45
Statistics        67
Social Science    89
Computer          90
Physics           34
Chemistry         56
Biology           78
dtype: int64

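A Series can also carry a `name` attribute, set with the `name` argument of `pd.Series`. The name is shown next to the dtype when the Series is displayed.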

In [8]:
# `name=` attaches a label to the Series itself; it shows up as "Name: ..." in the repr.
# Note: `marks` is rebound here, from the earlier list of numbers to the resulting Series.
marks = pd.Series(data=marks, index=subjects, name="Marks of Zubair")
marks

Maths             45
Statistics        67
Social Science    89
Computer          90
Physics           34
Chemistry         56
Biology           78
Name: Marks of Zubair, dtype: int64

## Series from dictionary

In [9]:
# Given a dict, pandas uses the keys as index labels and the values as the data.
dict_marks = {
    'Maths': 45, 'Statistics': 67, 'Social Science': 89, 'Computer': 90,
    'Physics': 34, 'Chemistry': 56, 'Biology': 78,
}
marks = pd.Series(data=dict_marks, name="Marks using dict")


## Series Attributes

In [10]:
# Quick tour of the most commonly used Series attributes.

# size: total number of elements
print("size =", marks.size)

# dtype: data type of the stored values
print("dtype =", marks.dtype)

# name: the label attached to the Series
print("name =", marks.name)

# is_unique: True only when no value is repeated
print("is_unique (marks) =", marks.is_unique)
# The second check runs on a throwaway Series that repeats the value 4, so that
# a False result is demonstrated as well. It is unrelated to `runs`, hence the label.
print("is_unique (series with a repeated 4) =", pd.Series([1, 2, 3, 4, 4, 5]).is_unique)

# index: custom labels for `marks`, the default integer index for `runs_ser`
print("index (marks) =", marks.index)
print("index (runs) =", runs_ser.index)

# values: the underlying data; the type is printed to show what container it is
print("values =", marks.values)
print("values are numpy array =", type(marks.values))

size = 7
dtype = int64
name = Marks using dict
is_unique (marks) = True
is_unique (runs) = False
index (marks) = Index(['Maths', 'Statistics', 'Social Science', 'Computer', 'Physics',
       'Chemistry', 'Biology'],
      dtype='object')
index (runs) = RangeIndex(start=0, stop=7, step=1)
values = [45 67 89 90 34 56 78]
values are numpy array = <class 'numpy.ndarray'>


## Series using read_csv

In [11]:
# Single-column CSV: read_csv hands back a DataFrame even when there is only one column.
df = pd.read_csv('pandas01_subs.csv')
# Squeeze along the column axis only, turning the one-column frame into a Series.
# A bare squeeze() collapses every length-1 axis, so it would return a scalar
# instead of a Series if the file ever contained just a single row.
subs = df.squeeze("columns")
subs


0       48
1       57
2       40
3       43
4       44
      ... 
360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [12]:
# Two-column CSV: one column is used as the index, the other supplies the values.
# With index_col='match_no' a single data column is left; squeezing along the
# column axis turns that frame into a Series regardless of the row count
# (a bare squeeze() would return a scalar for a one-row file).
kohli_runs = pd.read_csv("pandas01_kohli_ipl.csv", index_col='match_no').squeeze("columns")
kohli_runs

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [13]:
# Movie title becomes the index and the one remaining column supplies the values.
# Squeezing along the column axis guarantees a Series even for a one-row file,
# where a bare squeeze() would collapse the result to a scalar.
movies = pd.read_csv('pandas01_bollywood.csv', index_col='movie').squeeze("columns")
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

## Series Methods

In [14]:
# head(n) returns the first n rows; tail() without an argument returns the last five.
# Each banner and its rows are emitted by one print call, separated by a newline.
print("head -----------------", movies.head(3), sep="\n")
print("tail -----------------", movies.tail(), sep="\n")


head -----------------
movie
Uri: The Surgical Strike                Vicky Kaushal
Battalion 609                             Vicky Ahuja
The Accidental Prime Minister (film)      Anupam Kher
Name: lead, dtype: object
tail -----------------
movie
Hum Tumhare Hain Sanam      Shah Rukh Khan
Aankhen (2002 film)       Amitabh Bachchan
Saathiya (film)               Vivek Oberoi
Company (film)                  Ajay Devgn
Awara Paagal Deewana          Akshay Kumar
Name: lead, dtype: object


In [15]:
# sample draws rows at random: n=4 asks for four of them
# (leaving n out would give a single row).
movies.sample(n=4)

movie
Alag                    Akshay Kapoor
Don Muthu Swami    Mithun Chakraborty
Ragini MMS 2              Sunny Leone
Saathiya (film)          Vivek Oberoi
Name: lead, dtype: object

In [16]:
# value_counts: frequency of each distinct value
movies.value_counts()    # here: how many movies each lead actor (the values) appears in
# In other words, the number of times each value occurs in the Series

lead
Akshay Kumar        48
Amitabh Bachchan    45
Ajay Devgn          38
Salman Khan         31
Sanjay Dutt         26
                    ..
Diganth              1
Parveen Kaur         1
Seema Azmi           1
Akanksha Puri        1
Edwin Fernandes      1
Name: count, Length: 566, dtype: int64

In [17]:
# sort_values returns a new, sorted Series; kohli_runs itself keeps its original order
sorted_runs = kohli_runs.sort_values(ascending=False)
print(sorted_runs.iloc[:1])   # top entry, still a Series (match_no label plus runs)
print(sorted_runs.iloc[0])    # only the highest score, as a scalar


match_no
128    113
Name: runs, dtype: int64
113


In [18]:
# inplace: sort the object itself rather than producing a new sorted Series
new_runs = kohli_runs.copy()    # work on a copy so kohli_runs stays untouched (the author notes this also avoids a warning)
new_runs.sort_values(inplace=True)    # inplace=True reorders new_runs itself, so the result is not assigned to anything
new_runs

match_no
87       0
211      0
207      0
206      0
91       0
      ... 
164    100
120    100
123    108
126    109
128    113
Name: runs, Length: 215, dtype: int64

In [19]:
# sort_index
movies.sort_index()  # orders the rows by index label (the movie title here); the result is only displayed, movies is not reassigned

movie
1920 (film)                   Rajniesh Duggall
1920: London                     Sharman Joshi
1920: The Evil Returns             Vicky Ahuja
1971 (2007 film)                Manoj Bajpayee
2 States (2014 film)              Arjun Kapoor
                                   ...        
Zindagi 50-50                      Veena Malik
Zindagi Na Milegi Dobara        Hrithik Roshan
Zindagi Tere Naam           Mithun Chakraborty
Zokkomon                       Darsheel Safary
Zor Lagaa Ke...Haiya!            Meghan Jadhav
Name: lead, Length: 1500, dtype: object

## Series Maths Methods

In [23]:
# count() reports only the non-null entries, whereas size counts every entry, nulls included
print("Number of non-null values in movies =", movies.count())

Number of non-null values in movies = 1500


In [24]:
# sum: adds up every value in the Series
print("Total Runs by Kohli =", kohli_runs.sum())

Total Runs by Kohli = 6634


In [25]:
# product: multiplies all the values together
print("Product of all marks =", marks.product())

Product of all marks = 3586587076800


In [26]:
# Central tendency and spread of the runs
print("Mean of runs =", kohli_runs.mean())
print("Median of runs =", kohli_runs.median())
# mode() returns a Series rather than a scalar, so its printout spans several lines
print("Mode of runs =", kohli_runs.mode())
print("Standard Deviation of runs =", kohli_runs.std())
print("Variance of runs =", kohli_runs.var())


Mean of runs = 30.855813953488372
Median of runs = 24.0
Mode of runs = 0    0
Name: runs, dtype: int64
Standard Deviation of runs = 26.22980132830278
Variance of runs = 688.0024777222343


In [28]:
# min, max: smallest and largest value in the Series
# `subs` holds the "Subscribers gained" column, so the labels name subscribers.
print("Minimum subscribers gained =", subs.min())
print("Maximum subscribers gained =", subs.max())

Minimum runs = 33
Maximum runs = 396


In [30]:
# describe: one-call summary with count, mean, std, min, quartiles (25%/50%/75%) and max
kohli_runs.describe()

count    215.000000
mean      30.855814
std       26.229801
min        0.000000
25%        9.000000
50%       24.000000
75%       48.000000
max      113.000000
Name: runs, dtype: float64

## Series Indexing

In [None]:
# integer indexing
x= pd.Series([12,13,14,35,46,57,58,79,9])