In [None]:
Stage-0: Import required modules such as pandas,numpy and so on
Stage-1: Reading File(CSV, EXCEL, Tab Separated, Parquet, pickle, feather, JSON, HTML, XML)
         creating dataframe from data.
Stage-2: Data Understanding: Shape, Index, Columns, info, describe, head, tail, nunique, size, sample,dtypes, duplicated
            

## Import the required libraries. The 'ggplot' matplotlib style is chosen for clear,
## easily understandable and visually appealing plots.
## pd.set_option controls display settings: here up to 200 columns are displayed;
## the same call can also set the max rows displayed or the float format.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

## Silence every warning for the rest of the session.
## NOTE(review): this blanket filter also hides deprecation warnings raised by the
## libraries used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')


# Read the csv file into a DataFrame.
# The separator is a forward slash: the previous "...csv\AB_NYC..." literal relied on
# "\A" being an unrecognised escape sequence (Python warns about those) and could only
# resolve on Windows. A forward slash works on Windows, macOS and Linux alike.

df = pd.read_csv("Airbnb_NYC_2019.csv/AB_NYC_2019.csv")


# Number of rows and columns, returned as a (rows, columns) tuple

df.shape

## First 6 rows: some NaN values are visible; id and host_id are different columns

df.head(6)

## Last 5 rows

df.tail(5)

## Summary of the columns (dtype and non-null count per column).
## NOTE(review): this cell used to describe both id and host_id as primary keys with
## unique values, but the nunique() note further down says id is the only column whose
## values are all unique. Presumably host_id repeats for hosts with several listings; confirm.

df.info()

## Observations from the summary statistics of the numeric columns:
## mean of price is 152, max is 10000, min is 0
## mean of number of reviews is 23, max is 629, min is 0


df.describe()

## Total number of elements in the DataFrame (rows * columns)

df.size

## Number of distinct values per column. Observations:
## room_type has only 3 unique values
## id is the only column whose values are all unique
## neighbourhood group has only 5 distinct values

df.nunique()

## Row index of the DataFrame

df.index

## Column labels

df.columns

## Data type of each column

df.dtypes

## 3 random rows; passing the same random_state returns the same sample every time the cell is run

df.sample(3, random_state=42)

Stage-3: Handling Null values: Remove Duplicates, Fill,Drop,Interpolate Nan or None values, isna()  
Stage-4: Save dataframe into CSV,Excel, Copy Data, Creating new df from existing df by pointing to it.  


        

## Number of null values in each column. Observations:
## last_review and reviews_per_month have the highest number of missing values.
## name and host_name have so few missing values that those rows can simply be dropped.

df.isnull().sum()

## Missing reviews_per_month entries (float data) are set to 0.0

df.loc[df['reviews_per_month'].isna(), 'reviews_per_month'] = 0.0

# Confirm that no NaN values are left in that column

df.reviews_per_month.isna().sum()

## last_review does not seem important, so the whole column is removed

df = df.drop(columns=['last_review'])

## Null values still remaining, per column

df.isna().sum(axis=0)

## Only a few rows still contain nulls, so every row with any null value is removed

df = df.dropna(axis='index', how='any')

## Final check: null counts after dropping the column, dropping the rows and filling with 0.0

df.isna().sum(axis=0)

## Checking for duplicates: duplicated() returns a boolean per row, marking a row as True
## when all of its values repeat an earlier row (with the default settings the first occurrence is not marked).
## Duplicates are 0 on this df as the id column holds all unique values.
## duplicated() can also be applied on a subset of columns.

df.duplicated().sum()

## duplicated() on a single column counts the values that repeat an earlier value in that column

df['host_name'].duplicated().sum()

## 5 random entries of the boolean duplicate mask for host_name

df['host_name'].duplicated().sample(5)

## Duplicate rows can be removed with drop_duplicates()


df.info()

## Saving the cleaned df as a csv file in the present folder.
## index=False keeps the row labels out of the file. After dropna() those labels are just the
## leftover original row numbers, and writing them would add an unnamed first column
## that shows up as 'Unnamed: 0' when the file is read back with read_csv.

df.to_csv('Airbnb_cleaned_df.csv', index=False)

## Creating a new, independent df (df1) from a 5000-row random sample of the existing df.
## random_state pins the sample so that every df1 cell below shows the same rows on each run
## (same idea as the seeded df.sample(3, random_state=42) earlier).
## copy() makes explicit that df1 does not share data with df.
## reset_index() gives df1 a fresh 0..4999 row index and keeps the old row labels as a
## leading 'index' column, so positional column numbers in df1 are shifted by one relative to df.

df1=df.sample(5000, random_state=42).copy().reset_index()

## (rows, columns) of the sample

df1.shape

Stage-5: Data Selection: iloc, loc, at, iat, .,  

        

## First 5 rows of the sample, for reference while selecting

df1.head(5)

# iloc selects by integer position: here the single value at row position 1, column position 2

df1.iloc[1,2]

# One row and a slice of columns: row position 1, column positions 2 and 3 (the slice end is exclusive)

df1.iloc[1,2:4]

# A slice of rows and one column: 1:2 is end-exclusive, so only row position 1 is returned, from column position 4

df1.iloc[1:2,4]

# A slice of rows and a slice of columns: row position 1 and column position 5, returned as a DataFrame

df1.iloc[1:2, 5:6]

## Multiple specific rows and columns: pass lists of positions (rows 1 and 4, columns 2 and 4)

df1.iloc[[1,4],[2,4]]

## loc selects by label: the value in the row labelled 1, column 'name'

df1.loc[1, 'name']

## A range of rows and one column by label; unlike iloc, loc slices include both end points (rows 1 to 4)

df1.loc[1:4, 'name']

## One row and a range of columns by label: every column from 'name' through 'host_name'

df1.loc[4, 'name':'host_name']

## A range of rows and a range of columns by label

df1.loc[1:4, 'name':'host_name']

## Multiple specific rows and columns by label in df1

df1.loc[[1,4],['name','host_name']]

## at can only access 1 row and 1 column by label, unlike loc, which can access more than one row and column
## at is used for faster access to a single value

df1.at[1,'name']

## iat can only access 1 row and 1 column by position, unlike iloc, which can access more than one row and column
## iat is used for faster access to a single value

df1.iat[1,4]

Stage-6: Data Filtering: isin, Query, between, Boolean indexing, Complex Filtering with Regex and contains, logical func(and, or)  
Stage-7: Data Transformation: lambda fun, defining function with logic  
Stage-8: Data Removal and Data Rearrangement: rename, drop, remove, rearrange columns, duplicated  
 

## isin is used to filter data in pandas; called on the df it checks every cell against the given values.
## The values being checked should be passed in a list.
## It returns a df of the same shape holding boolean values.

df1.isin(['Tara & Carl'])

## To flag the cells that do not contain the given value, negate the result with ~.

~df1.isin(['Tara & Carl'])

## To check values in specific columns only, pass a dictionary mapping each column to its list of values.

df1.isin({'host_name':['Tara & Carl'],'host_id':[38263259]})



## Combining conditions with logical operators while filtering: | keeps the rows that satisfy either one of the conditions.

df1[(df1['host_name'].isin(['Tara & Carl'])) | (df1['host_id'].isin([38263259]))]

## iloc and loc can also be used to build filter conditions. Here iloc picks the columns at positions 5 and 4,
## and & keeps the rows where both comparisons are true.
## NOTE(review): presumably positions 5 and 4 are neighbourhood_group and host_name; this depends on df1's column order, confirm.

df1[(df1.iloc[:,5]=='Brooklyn')&(df1.iloc[:,4]=='Sorel')]

## query is a powerful and concise way to filter rows in pandas; it is similar to WHERE in SQL.
## df.query('condition'): the condition can be a single one or several combined with logical operators.

df1.query('host_name == "Amy"')

## Any relational operator can be used, such as <, >, ==, !=

df1.query('price>85')

## Combine multiple conditions with logical operators such as and (&), or (|)

df1.query('host_name == "Amy" & price>85' )

## between works like BETWEEN in SQL, except that the two bounds are separated by ',' instead of AND;
## by default both bounds are included.

df1[df1['price'].between (10, 100)]

## Boolean indexing is a powerful technique for complex filtering.
## One or more conditions (string checks, relational operators), combined with logical operators, go inside df[].
## NOTE: the cells in this part filter the full df, not the df1 sample.

df[(df['price']>100) & (df['host_name'].isin(['Amy']))]


## Boolean indexing with loc; this gives the same result as the cell above with df[]

df.loc[(df['price']>100) & (df['host_name'].isin(['Amy']))]

## Boolean indexing with iloc: iloc is positional, so the mask is passed as a plain array via .values

df.iloc[((df['price']>100) & (df['host_name'].isin(['Amy']))).values]

## Lambda functions in pandas provide a concise way to perform quick calculations or transformations without defining a separate function.
## With apply, the lambda's logic is applied to every element of a specific column (as here) or along the rows/columns of a df.

df2=pd.DataFrame()

## New 'price' column: every price from df1 increased by 1

df2['price']=df1['price'].apply(lambda x:x+1)

df2.sample(10)

## Using lambda with assign creates a new column from a computation; here x refers to the dataframe (df2).

df2=df2.assign(price1= lambda x:x['price']+x['price']*2)

## applymap is used for element-wise operations, i.e. on every element in the dataframe; here x is a single cell value.
## The result is only displayed, not assigned, so df2 itself is unchanged.
## NOTE(review): recent pandas releases deprecate applymap in favour of DataFrame.map; check the installed version.

df2.applymap(lambda x: x**2)

## map works on a series; here x is each individual value of df2['price1']

df2['price_categ']=df2['price1'].map(lambda x: 'High' if x>100 else 'Low')

## Defining a function and calling it: the defined function is applied to a column using 'apply'

def sq_func(x):
    """Return ``x`` raised to the power of 2."""
    return pow(x, 2)

## New 'price2' column: sq_func applied to every value of 'price'
df2['price2']= df2['price'].apply(sq_func)

## inplace is optional: with inplace=True the change is made on df2 itself, otherwise a renamed copy is returned

df2.rename(columns={'price2': 'price_2'}, inplace=True)

## Using the axis parameter to rename all columns to lower case.
## NOTE: this is called on df and the result is not assigned, so it is only displayed and df is unchanged.

df.rename(str.lower, axis='columns')

## Without axis, the columns= keyword can be used instead; here all column names of df2 are changed to upper case.

df2=df2.rename(columns=str.upper)

df2

## Duplicate rows can be dropped directly; the result is not assigned here, so df2 is unchanged

df2.drop_duplicates()

## drop can be used to drop rows or columns; pass inplace=True to make the change on the df itself.
## Rows or columns that satisfy a condition can be dropped by passing their labels.
## Columns can be named with columns=..., or with the labels plus axis=1.
## For rows, index= is optional: if neither index nor columns (nor axis) is given, the labels are treated as row labels by default.
## Here the result is not assigned, so df2 still contains PRICE_2 afterwards.

df2.drop(columns=['PRICE_2'])

## Rearranging columns: take the column labels as a list, then rebuild the order so that
## the column at position 10 comes right after the first five columns.

cols = df1.columns.tolist()
df1 = df1.loc[:, [*cols[:5], cols[10], *cols[5:10], *cols[11:16]]]
df1.columns