## Advanced Pandas

In [None]:
# import libraries
import pandas as pd
import numpy as np

In [152]:
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.width', None)  # Disable line wrapping
# pd.set_option('display.max_colwidth', None)  # Show full column width
# pd.set_option('display.float_format', '{:.6f}'.format)  # To display floats without scientific notation

- Read CSV file

In [None]:
# Load the raw funding data. thousands=',' makes the parser treat commas as
# digit-group separators when reading numeric fields.
data = pd.read_csv(filepath_or_buffer='startup_funding.csv', thousands=',')
data.head()

In [None]:
# (number of rows, number of columns) of the frame as loaded
data.shape

In [None]:
# Summary statistics; with no arguments describe() reports on numeric columns
data.describe()

## Making a Copy of Data

Make a copy of the data before changing anything: if we accidentally corrupt `df` later, the untouched original is still available and we do not have to re-read the file.

In [156]:
# copy() creates a separate, independent copy: later changes to df do not touch data
df = data.copy()

In [157]:
# Plain assignment does NOT copy: df1 is just a second name for the same object
# as data (like a pointer), so a change made through one name is visible through
# the other.
df1 = data

## Basic Data Exploration

#### Getting Shape of Data

In [None]:
# (rows, columns) of the working copy
df.shape

#### Getting a list of all columns in dataframe

In [None]:
# Index object holding every column label
df.columns

#### Checking data types of all columns

In [None]:
# One dtype per column; text columns show up as object
df.dtypes

#### Getting the First/Last Rows

In [None]:
# First 2 rows (head() with no argument would return 5)
df.head(2)

In [None]:
# Last 2 rows (tail() with no argument would return 5)
df.tail(2)

#### Getting Summary of all columns

In [None]:
# Default summary: count, mean, std, min, quartiles and max of numeric columns
df.describe()

In [None]:
# include='O' restricts the summary to object (text) columns, which report
# count, unique, top (most frequent value) and freq instead of numeric stats
df.describe(include='O')

In [None]:
# include='all' summarises every column at once; statistics that do not apply
# to a column's dtype are shown as NaN
df.describe(include='all')

#### Getting Unique values of single column

In [None]:
df['InvestmentType'].nunique() # number of distinct values (NaN is not counted by default)

In [None]:
# Array of the distinct values themselves, in order of first appearance
df['InvestmentType'].unique()

In [None]:
# How many rows carry each distinct value, most frequent first
df['InvestmentType'].value_counts()

#### Missing Values

Checking which columns have missing values

In [None]:
# isnull() marks missing cells as True; summing the booleans gives a per-column count
df.isnull().sum()

- Filling missing values

In [None]:
# Look at the first rows before filling, to compare with the result afterwards
df.head(3)

In [171]:
# fill missing values
# df.fillna(0, inplace=True)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['AmountInUSD'] selection: the in-place form mutates an intermediate object,
# which pandas warns about (chained assignment) and which leaves df unchanged
# when Copy-on-Write is active.
df['AmountInUSD'] = df['AmountInUSD'].fillna(0)

In [None]:
# Same rows as before the fill, to confirm the AmountInUSD change
df.head(3)

#### Treating all missing values at once

In [174]:
# Called on the whole frame, fillna replaces every remaining NaN, in any
# column, with the single value 'Others'.
df = df.fillna('Others')

In [None]:
# Re-check the per-column missing counts after filling
df.isnull().sum()

### Duplicates

`duplicated()` flags every value that has already appeared earlier in the column, and `sum()` counts those flags.

In [None]:
# Number of rows whose StartupName repeats an earlier row (first occurrences are not counted)
df['StartupName'].duplicated().sum()

In [None]:
# Boolean-mask filter: every row whose StartupName is exactly 'Ola'
df[df['StartupName'] == 'Ola']

In [178]:
# display all the duplicated rows
# df[df['StartupName'].duplicated()]

### Checking for whole row duplicates 

In [None]:
# Without a subset, duplicated() compares entire rows; this shows the rows that
# are exact repeats of an earlier row
df[df.duplicated()]

#### Deleting Duplicates from specific column

In [None]:
# Baseline shape, for comparison with the de-duplicated shapes below
df.shape

In [None]:
# keep='first' (the default) retains the earliest row for each StartupName
df.drop_duplicates(subset='StartupName', keep='first').shape

In [None]:
# keep='last' retains the final row for each StartupName and removes the earlier ones
df.drop_duplicates(subset='StartupName', keep='last').shape

In [None]:
# keep=False removes every row whose StartupName occurs more than once, so no
# copy of a duplicated name is left
df.drop_duplicates(subset='StartupName', keep=False).shape

### Delete whole row duplicates

In [None]:
# Shape after removing rows that are identical in every column; the result is
# not assigned, so df itself is unchanged
df.drop_duplicates().shape

In [None]:
# df still has its original shape because the previous result was not saved
df.shape

- Now we will use inplace=True to save the dataframe

In [None]:
# inplace=True modifies df itself and returns None. The surviving rows keep
# their original index labels (the index is not renumbered).
df.drop_duplicates(inplace=True)
df.shape

### Selecting / Dropping Columns

- **Selecting Columns**

In [None]:
# A list of labels inside [] selects several columns and returns a new DataFrame
df[['Date','CityLocation', 'AmountInUSD']]

- **Dropping Columns**

In [188]:
# Add a throwaway column (constant 999 in every row) so there is something to drop
df['index'] = 999

In [None]:
# The new 'index' column now appears in the column list
df.columns

In [190]:
# Remove the throwaway column again; columns= names the axis explicitly
df = df.drop(columns='index')

In [None]:
# Confirm that 'index' is gone
df.columns

### String Operations on Whole Columns

In [None]:
# Inspect the current labels to spot inconsistent spellings
df['InvestmentType'].unique()

- **String Replacement**

In [None]:
# Series.replace(old, new) swaps cells whose whole value equals `old` for `new`
# (this is not the substring-based .str.replace). The result is only displayed
# here, not assigned, so df is left unchanged.
df['InvestmentType'].replace('PrivateEquity', 'Private Equity').unique()

In [None]:
# The column still holds the old spelling because the replacement was not saved
df['InvestmentType'].unique()

In [195]:
# Assign the result back to the column to make the replacement permanent
df['InvestmentType'] = df['InvestmentType'].replace('PrivateEquity', 'Private Equity')

In [None]:
# 'PrivateEquity' should no longer be listed
df['InvestmentType'].unique()

In [197]:
# Normalise the remaining spelling variants in one pass with an old -> new mapping
df['InvestmentType'] = df['InvestmentType'].replace({
    'SeedFunding': 'Seed Funding',
    'Crowd funding': 'Crowd Funding',
})

In [None]:
# Final check of the cleaned labels
df['InvestmentType'].unique()

### Capitalization

In [None]:
# A cell renders only its last expression, so the counts must be shown
# explicitly; display() is the built-in provided by IPython/Jupyter.
display(df['CityLocation'].value_counts())
df['CityLocation'].unique()

In [200]:
# Preview the three casings. As bare expressions in the middle of a cell their
# results would be computed and discarded, so render them with display().
display(
    df['CityLocation'].str.upper().head(),
    df['CityLocation'].str.lower().head(),
    df['CityLocation'].str.title().head(),
)
# Title case is the version saved back to the dataframe
df['CityLocation'] = df['CityLocation'].str.title()

In [None]:
# Confirm CityLocation is now title-cased
df.head(2)

### Checking if there is a substring in each column value

In [None]:
# Plain-Python substring search: str.find returns the index where the match
# starts (1 here) or -1 when the substring is absent
s = 'Dice'
s.find('ice')

In [None]:
# str.contains tests every value for the substring; value_counts() then tallies
# how many rows matched (True) and how many did not (False)
df['InvestorsName'].str.contains('Khan').value_counts()

In [None]:
# Rows whose InvestorsName contains 'Khan'.
# na=False counts a missing name as "no match", so the mask is strictly boolean
# and the filter cannot fail on NaN; regex=False treats 'Khan' as literal text.
df[df['InvestorsName'].str.contains('Khan', na=False, regex=False)]

### DataFrame.at vs DataFrame.loc

In [None]:
# Row with index label 133, returned as a Series (label-based, not positional)
df.loc[133]

In [206]:
# .loc[row label, column label] = value writes a single cell.
# NOTE(review): if label 133 is not in the index (e.g. removed by an earlier
# drop), .loc assignment adds a new row instead of raising — confirm it exists.
df.loc[133, 'IndustryVertical'] = 'abc'

In [207]:
# .at does the same single-cell write; it only accepts one row label and one column label
df.at[133, 'IndustryVertical'] = 'Technology'

In [None]:
# IndustryVertical should now read 'Technology' (the later .at write wins)
df.loc[133]

In [209]:
# Overwrite the StartupName cell of the row labelled 133
df.at[133, 'StartupName'] = 'Zubair Labs'

In [None]:
# Confirm the new StartupName
df.loc[133]

In [None]:
# .at is the scalar-only accessor and is generally faster than .loc for reading
# a single cell; compare this timing with the .loc timing in the next cell
%timeit df.at[133, 'StartupName']

In [None]:
# Same single-cell read through the general-purpose .loc indexer, for comparison
%timeit df.loc[133, 'StartupName']

## **Joins**

In [None]:
# Two overlapping positional row slices (rows 0-9 and rows 5-11), so the two
# tables built from them share some SNo keys but not all of them.
rows1 = df.iloc[0:10]
rows2 = df.iloc[5:12]
print(rows1.shape)
# A bare expression in the middle of a cell is never rendered; display() shows it
display(rows1)

# Each table keeps the join key 'SNo' plus a different set of columns
table1 = rows1[['SNo', 'Date', 'CityLocation']]
table2 = rows2[['SNo', 'StartupName', 'IndustryVertical', 'InvestmentType']]
print(table1.shape)
print(table2.shape)
table1

- Merge table1 with table2 using **table1.merge(table2, how='right', on='SNo')**
- Most simply, we can explicitly specify the name of the key column using the **on** keyword.
- The **how** keyword selects the join type: **'inner', 'outer', 'left'** or **'right'**
- An outer join returns a join over the union of the keys of both tables and fills in all missing values with NaN; a right join (used below) keeps every key of the right-hand table.

In [None]:
# Right join on SNo: every row of table2 is kept, and table1's columns are NaN
# where table1 has no matching SNo
pd.merge(table1, table2, how='right', on='SNo')

### **Sorting**

In [None]:
# Returns a new frame ordered by CityLocation (ascending by default); df itself
# is not modified
df.sort_values('CityLocation')

### **Concatenation**

In [None]:
# Column labels available for building the two parts below
df.columns

In [None]:
# Split the frame column-wise into two pieces that share the same row index
part1 = df.loc[:, ['SNo', 'Date', 'StartupName']]
part2 = df.loc[:, ['InvestorsName', 'InvestmentType']]
print(part1.shape)
part1.head()

In [None]:
# Same number of rows as part1, different columns
print(part2.shape)
part2.head()

In [None]:
# Side-by-side concatenation: rows are aligned on the index and the columns of
# both parts are placed next to each other
pd.concat([part1, part2], axis='columns')

In [2]:
# Two non-overlapping positional row slices to stack on top of each other
# (rows1/rows2 are re-bound here; the join example's slices are replaced)
rows1 = df.iloc[:3]
rows2 = df.iloc[5:7]
print(rows1.shape)
rows1

In [4]:
# Second slice: same columns as rows1, different rows
print(rows2.shape)
rows2

In [None]:
# Stack the two slices vertically; each row keeps its original index label
pd.concat([rows1, rows2], axis='index')