**Required libraries** -> pandas, openpyxl (later cells also import seaborn and plotly)
- Run this command in the terminal:
    ```
    pip install pandas openpyxl seaborn plotly --upgrade
    ```

Pandas Concepts to be covered:
- Loading Data into Pandas
- Series and DataFrames
- Viewing Data
    - head()
    - tail()
    - sample()
- Selection 
    - single column
    - multiple columns
    - Row Selection by Label
        - loc
    - Row Selection by Position
        - iloc
- Data Manipulation
    - Adding Columns
    - Removing Columns
    - Renaming Columns
    - Replacing Values
    - Applying Functions
- Data Cleaning
    - Handling Missing Data
    - Handling Duplicates
    - Handling Outliers
    - Handling Incorrect Data Types
    - Handling Inconsistent Data Entry
- Grouping and Aggregating
    - Grouping
    - Aggregating
    - Applying Functions
- Sorting
    - Sorting by Index
    - Sorting by Values
- Data Visualization
    - Matplotlib
    - Seaborn
    - Plotly


In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
file = 'Canada.xlsx'
# sheet_name=1 selects the second sheet (integer positions are zero-indexed).
# skiprows=20 / skipfooter=2 drop the first 20 and last 2 rows of that sheet
# (presumably non-tabular notes around the table -- confirm against the workbook).
canada = pd.read_excel(file, sheet_name=1, skiprows=20, skipfooter=2)

In [None]:
canada['AreaName'] # series - only 1 column (1 Dimensional)

In [None]:
canada # data frame - 2D data structure

In [None]:
canada.head() # first 5

In [None]:
canada.head(2) # first 2

In [None]:
canada.tail()

In [None]:
canada.sample(5) # random sample of 5 rows

In [None]:
canada[5:10] # slice of dataset

In [None]:
canada['RegName'] # dict-like access to a Series

In [None]:
canada.RegName # object style access

In [None]:
cols = list(range(1980, 1991))
canada[cols]

In [None]:
cols = ['AreaName','RegName',2013]
canada[cols]

In [None]:
canada[['AreaName','RegName',2013]]

In [None]:
canada.set_index('OdName', inplace=True)

In [None]:
canada

In [None]:
canada.iloc[0] # row index 0

In [None]:
canada.iloc[100]

In [None]:
canada.iloc[10:20] # rows at positions 10-19 (the stop position 20 is excluded)

In [None]:
canada.iloc[10:16, :10] # 10-15 rows, 0-9 columns

In [None]:
canada.loc['Japan'] # row that has the index 'Japan'

In [None]:
canada.loc[['Japan','France']]

In [None]:
canada.loc[['Japan','France'], [1980, 1981, 1982, 1983, 1984, 1985]]

In [None]:
vs = canada.loc[['Japan','France','India'], [1980, 1981, 1982, 1983, 1984, 1985]]

Adding a `Total` column (row-wise sum of the year columns 1980–2013)

In [None]:
# adding a total column
# The year columns are labelled with ints; range(1980, 2014) covers 1980..2013
# inclusive because the stop value is excluded.
years = list(range(1980, 2014))
# Preview only: row-wise sum (axis=1) across the year columns, not stored yet.
canada[years].sum(axis=1)

In [None]:

canada['Total'] = canada[years].sum(axis=1) # axis=1 means sum across columns (horizontally)
canada.head()

In [None]:
canada['Faltu'] = 20
canada.head(3)

removing a column

In [None]:
canada.drop(columns='Faltu',inplace=True)
canada.head(2)

removing multiple columns

In [None]:
cols_to_drop = ['Type','Coverage','AREA','REG','DEV']
canada.drop(columns=cols_to_drop, inplace=True)
canada.head(2)

renaming columns

In [None]:
# Rename the three descriptive columns in place; `columns=` targets column
# labels directly, so no axis argument is needed.
canada.rename(
    columns={
        'AreaName': 'Continent',
        'RegName': 'Region',
        'DevName': 'Status',
    },
    inplace=True,
)
canada.head(2)

replacing data

In [None]:
# Encode the two development labels as 0/1 wherever they occur in the frame,
# using one value -> replacement mapping instead of two separate passes.
canada.replace(
    {'Developing regions': 0, 'Developed regions': 1},
    inplace=True,
)
canada.head()

Applying a lambda function to a column

In [None]:
# apply() calls the lambda once per value and returns a boolean Series;
# value_counts() then tallies how many rows are above / not above 50000.
# For a plain comparison like this, the vectorized `canada['Total'] > 50000`
# produces the same boolean Series without a Python-level call per row.
canada['Total'].apply(lambda x: x > 50000).value_counts()

Applying a named function to a DataFrame column

In [None]:
import seaborn as sns

penguins = sns.load_dataset("penguins")

In [None]:
pd.cut(penguins['body_mass_g'], 3).unique()

In [None]:
def categorize_by_weight(value):
    """Classify a body-mass value into a weight category.

    Parameters
    ----------
    value : float
        Body mass to classify (presumably grams, given the ``body_mass_g``
        column this is applied to). May be missing (NaN / None).

    Returns
    -------
    str or float
        ``'Light weight'`` when ``value <= 3900``, ``'Medium weight'`` when
        ``3900 < value <= 5100``, ``'Heavy weight'`` when ``value > 5100``,
        and ``np.nan`` when the value is missing.
    """
    # NaN never compares equal to anything, itself included, so an
    # `== np.nan` test can never be true. Without a real missing-value check
    # NaN fails every comparison below and the function would return None.
    if pd.isna(value):
        return np.nan
    if value <= 3900:
        return 'Light weight'
    # Reaching here already implies value > 3900.
    if value <= 5100:
        return 'Medium weight'
    return 'Heavy weight'

# categorize_by_weight(4000)
penguins['category'] = penguins['body_mass_g'].apply(categorize_by_weight)
penguins.head()    

- Data Cleaning
    - Handling Missing Data
    - Handling Duplicates
    - Handling Incorrect Data Types
    - Handling Inconsistent Data Entry
- Grouping and Aggregating
    - Grouping
    - Aggregating
    - Applying Functions
- Sorting
    - Sorting by Index
    - Sorting by Values

In [None]:
import plotly.express as px

In [None]:
penguins

In [None]:
penguins.groupby('species').get_group('Gentoo')

In [None]:
penguins.groupby('species')['bill_length_mm'].max()

In [None]:
penguins.groupby('species')[['bill_length_mm','body_mass_g']].max()

In [None]:
penguins.groupby('species')[['bill_length_mm','body_mass_g']].sum()

In [None]:
# numpy print options
np.set_printoptions(precision=2)
pd.set_option('display.precision', 2)

In [None]:
# Mean bill length and body mass for every species/island/sex combination,
# with the group keys turned back into ordinary columns.
(
    penguins
    .groupby(['species', 'island', 'sex'])[['bill_length_mm', 'body_mass_g']]
    .mean()
    .reset_index()
)

In [None]:
penguins.pivot_table(values='body_mass_g', index='species', columns='island')

In [None]:
pp = penguins.pivot_table(values='body_mass_g', index='species', columns='island',
                     aggfunc='sum')
pp

In [None]:
# Cells left empty by the pivot (NaN) become 0, then every total is stored
# as an integer.
pp = pp.fillna(0).astype(int)
pp

In [None]:
# Per species/island/sex means of bill length and body mass, kept as a flat
# DataFrame (group keys as regular columns).
groupdf = (
    penguins
    .groupby(['species', 'island', 'sex'])[['bill_length_mm', 'body_mass_g']]
    .mean()
    .reset_index()
)

In [None]:
groupdf.sort_values('body_mass_g')

In [None]:
groupdf.sort_values('body_mass_g', ascending=False) # descending order

In [None]:
sns.get_dataset_names()

In [None]:
car_crashes = sns.load_dataset('car_crashes')
car_crashes.head()

In [None]:
car_crashes.drop(columns=['abbrev'], inplace=True)

Filtering rows with a boolean condition (after sorting by `alcohol`)

In [None]:
car_crashes.sort_values(by='alcohol', ascending=False, inplace=True)

In [None]:
car_crashes[car_crashes['alcohol'] > 5]