## Pandas
- Pandas is a robust Python package for data analysis and manipulation.
- Pandas offers 2 primary data structures:
   - Series: labelled array in one dimension.
   - DataFrame: labelled two-dimensional data structure similar to a spreadsheet or table.
- Dataframe: Features labeled rows and columns.
  - A DataFrame is comparable to an SQL table or an Excel sheet.

In [2]:
import pandas as pd

## Reading a csv file

In [39]:
# Load the sample CSV file into a DataFrame named `df`.
df = pd.read_csv(filepath_or_buffer="sample_data.csv")

In [4]:
# Preview the first 5 rows of the DataFrame
df.head(n=5)

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Elwin,Orwell,eorwell0@amazon.de,Male,7.235.208.59
1,2,Cassie,Brookesbie,cbrookesbie1@liveinternet.ru,Male,175.156.141.54
2,3,Alard,Vouls,avouls2@163.com,Male,51.94.241.193
3,4,Giraud,Cajkler,gcajkler3@java.com,Male,178.123.37.71
4,5,Shelli,Husbands,shusbands4@answers.com,Female,133.180.254.33


# Exploring data

In [5]:
# Preview the last 5 rows of the DataFrame
df.tail(n=5)

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
995,996,Kerrin,Greetham,kgreethamrn@reverbnation.com,Female,22.229.189.74
996,997,Debee,Emblin,demblinro@cisco.com,Genderqueer,195.93.16.101
997,998,Puff,Heersema,pheersemarp@eventbrite.com,Male,173.66.97.152
998,999,Brena,Gummer,bgummerrq@dailymail.co.uk,Female,141.165.113.191
999,1000,Minny,MacElroy,mmacelroyrr@independent.co.uk,Female,212.71.158.170


In [8]:
# View the dataset's dimensions as a (rows, columns) tuple
df.shape

(1000, 6)

## summary of the dataframe

In [9]:
# Print a concise summary: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1000 non-null   int64 
 1   first_name  1000 non-null   object
 2   last_name   1000 non-null   object
 3   email       1000 non-null   object
 4   gender      1000 non-null   object
 5   ip_address  1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


## Getting the statistical summary of the dataset

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max); here only `id` is numeric
df.describe()

Unnamed: 0,id
count,1000.0
mean,500.5
std,288.819436
min,1.0
25%,250.75
50%,500.5
75%,750.25
max,1000.0


In [12]:
# Selecting columns and rows
# Single-column syntax -> df['column_name'] (the result is a Series)
df['first_name']

0       Elwin
1      Cassie
2       Alard
3      Giraud
4      Shelli
        ...  
995    Kerrin
996     Debee
997      Puff
998     Brena
999     Minny
Name: first_name, Length: 1000, dtype: object

In [13]:
# Let's say I want the emails: select the "email" column
df["email"]

0                 eorwell0@amazon.de
1       cbrookesbie1@liveinternet.ru
2                    avouls2@163.com
3                 gcajkler3@java.com
4             shusbands4@answers.com
                   ...              
995     kgreethamrn@reverbnation.com
996              demblinro@cisco.com
997       pheersemarp@eventbrite.com
998        bgummerrq@dailymail.co.uk
999    mmacelroyrr@independent.co.uk
Name: email, Length: 1000, dtype: object

In [14]:
# How to select multiple columns
# syntax -> df[['col1', 'col2']]  (pass a list of column names)
df[['email', 'ip_address']]

Unnamed: 0,email,ip_address
0,eorwell0@amazon.de,7.235.208.59
1,cbrookesbie1@liveinternet.ru,175.156.141.54
2,avouls2@163.com,51.94.241.193
3,gcajkler3@java.com,178.123.37.71
4,shusbands4@answers.com,133.180.254.33
...,...,...
995,kgreethamrn@reverbnation.com,22.229.189.74
996,demblinro@cisco.com,195.93.16.101
997,pheersemarp@eventbrite.com,173.66.97.152
998,bgummerrq@dailymail.co.uk,141.165.113.191


In [15]:
# Select a single row by integer position, i.e. one individual's info.
# Positions are 0-based, so iloc[2] is the third row (id 3), not the first.
df.iloc[2]

id                          3
first_name              Alard
last_name               Vouls
email         avouls2@163.com
gender                   Male
ip_address      51.94.241.193
Name: 2, dtype: object

In [16]:
# Filtering rows: keep only the records whose id is greater than 45
id_above_45 = df['id'] > 45
df.loc[id_above_45]

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
45,46,Simmonds,Blei,sblei19@google.com.br,Male,169.33.45.9
46,47,Janessa,Ashbey,jashbey1a@edublogs.org,Female,104.146.214.243
47,48,Vaughn,Casaro,vcasaro1b@ask.com,Male,15.197.58.111
48,49,Bentley,Wardale,bwardale1c@toplist.cz,Male,157.253.195.5
49,50,Nady,Rustidge,nrustidge1d@omniture.com,Female,134.10.122.204
...,...,...,...,...,...,...
995,996,Kerrin,Greetham,kgreethamrn@reverbnation.com,Female,22.229.189.74
996,997,Debee,Emblin,demblinro@cisco.com,Genderqueer,195.93.16.101
997,998,Puff,Heersema,pheersemarp@eventbrite.com,Male,173.66.97.152
998,999,Brena,Gummer,bgummerrq@dailymail.co.uk,Female,141.165.113.191


In [19]:
# Filtering rows on multiple conditions: combine boolean masks with `&`
id_above_567 = df['id'] > 567
is_male = df['gender'] == 'Male'
df.loc[id_above_567 & is_male]

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
568,569,Nealson,Tyas,ntyasfs@free.fr,Male,76.74.55.205
571,572,Riobard,Gamlyn,rgamlynfv@elegantthemes.com,Male,182.247.205.213
573,574,Ricky,Soughton,rsoughtonfx@google.pl,Male,69.2.68.31
575,576,Manfred,Dispencer,mdispencerfz@myspace.com,Male,187.236.131.198
580,581,Guthry,Smellie,gsmellieg4@alexa.com,Male,182.55.34.56
...,...,...,...,...,...,...
983,984,Clemente,Greg,cgregrb@so-net.ne.jp,Male,158.65.158.194
984,985,Madison,Carrick,mcarrickrc@indiatimes.com,Male,209.155.150.76
985,986,Gun,Gencke,ggenckerd@yellowbook.com,Male,73.8.200.131
989,990,Damien,Midford,dmidfordrh@uiuc.edu,Male,145.234.130.190


## Adding a new column

In [20]:
# Append a derived column: NewCol holds each row's id plus 2
df = df.assign(NewCol=df['id'] + 2)

In [21]:
# Overwrite the existing `id` column with twice its value.
# NOTE: re-running this cell doubles `id` again; reload the CSV to reset.
df = df.assign(id=df['id'] * 2)

In [22]:
# Check the result: `id` is doubled and `NewCol` appears as the last column
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address,NewCol
0,2,Elwin,Orwell,eorwell0@amazon.de,Male,7.235.208.59,3
1,4,Cassie,Brookesbie,cbrookesbie1@liveinternet.ru,Male,175.156.141.54,4
2,6,Alard,Vouls,avouls2@163.com,Male,51.94.241.193,5
3,8,Giraud,Cajkler,gcajkler3@java.com,Male,178.123.37.71,6
4,10,Shelli,Husbands,shusbands4@answers.com,Female,133.180.254.33,7


## Removing a column

In [23]:
# Remove the `NewCol` column added earlier.
# `columns=` targets column labels; the previous `drop(0, axis=0)` removed
# the row labelled 0 instead of a column. Rebinding `df` rather than using
# `inplace=True` keeps the cell explicit about what it produces.
df = df.drop(columns=['NewCol'])

In [26]:
# Re-read the CSV so `df` holds the original, unmodified data again
df = pd.read_csv('sample_data.csv')

In [27]:
# Confirm the freshly loaded DataFrame has its original columns and values
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Elwin,Orwell,eorwell0@amazon.de,Male,7.235.208.59
1,2,Cassie,Brookesbie,cbrookesbie1@liveinternet.ru,Male,175.156.141.54
2,3,Alard,Vouls,avouls2@163.com,Male,51.94.241.193
3,4,Giraud,Cajkler,gcajkler3@java.com,Male,178.123.37.71
4,5,Shelli,Husbands,shusbands4@answers.com,Female,133.180.254.33


In [28]:
# Sorting the values
# Order the rows by the `gender` column; ascending order is the default.
df.sort_values(by='gender')

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
169,170,Friederike,Leggs,fleggs4p@cornell.edu,Agender,10.6.80.142
507,508,Peggi,Grossier,pgrossiere3@diigo.com,Agender,67.23.92.204
269,270,Bordie,Pantecost,bpantecost7h@opera.com,Agender,187.244.139.161
67,68,Freedman,MacCleod,fmaccleod1v@merriam-webster.com,Agender,77.118.34.155
755,756,Parke,Pikesley,ppikesleykz@spiegel.de,Agender,22.107.61.140
...,...,...,...,...,...,...
456,457,Katleen,Loram,kloramco@altervista.org,Polygender,119.225.141.193
154,155,Eolanda,Radbone,eradbone4a@oracle.com,Polygender,187.28.123.83
598,599,Julietta,Cursons,jcursonsgm@cam.ac.uk,Polygender,123.49.191.86
356,357,Corrie,Norwell,cnorwell9w@sbwire.com,Polygender,72.72.63.119


## Checking or handling missing data

In [31]:
# Count the missing (null) values in each column
df.isna().sum()

id            0
first_name    0
last_name     0
email         0
gender        0
ip_address    0
dtype: int64

In [32]:
# Dropping null values: remove every row that contains a missing value
df = df.dropna()

In [33]:
# Alternative to dropping: replace any missing values with 0
df = df.fillna(0)

In [34]:
# Export the DataFrame to a CSV file, omitting the row index
df.to_csv(path_or_buf='output.csv', index=False)

In [37]:
# Re-read the source CSV into `df`
df = pd.read_csv('sample_data.csv')

In [38]:
# Show the first 5 rows of the reloaded DataFrame
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Elwin,Orwell,eorwell0@amazon.de,Male,7.235.208.59
1,2,Cassie,Brookesbie,cbrookesbie1@liveinternet.ru,Male,175.156.141.54
2,3,Alard,Vouls,avouls2@163.com,Male,51.94.241.193
3,4,Giraud,Cajkler,gcajkler3@java.com,Male,178.123.37.71
4,5,Shelli,Husbands,shusbands4@answers.com,Female,133.180.254.33
