In [None]:
!pip install pandas

# pandas for analysis
- datasets
  - dictionary of items
  - list of dictionaries
  - csv file
  - excel file
  - json file
  - sql connection
- creating dataframe
- reading data 
- selection of data
- manipulating data
- cleaning data
- visualizing data
- saving data

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Column-oriented input: each key is a column name, each list holds that column's values.
books = dict(
    title=['The art of war', 'The final empire'],
    author=['Sun Tzu', 'Brandon Sanderson'],
    price=[399, 1099],
)
pd.DataFrame(data=books)

In [None]:
# Row-oriented input: one dict per record; the keys become the column names.
movies = [
    dict(title="PK", director="Rajkumar Hirani", year=2014),
    dict(title="I", director="Shankar", year=2015),
    dict(title="Pink", director="Aniruddha Roy Chowdhury", year=2016),
]
pd.DataFrame(data=movies)

In [None]:
# Load a CSV file (path is relative to the current working directory) into a DataFrame.
pd.read_csv(filepath_or_buffer='dummy.csv')

In [None]:
# Load a JSON file (path is relative to the current working directory) into a DataFrame.
pd.read_json(path_or_buf='dummy.json')

In [None]:
# Read an Excel workbook from the current user's Documents folder.
# The path is built from the home directory instead of a hard-coded C:\Users\<name> prefix,
# so the cell is not tied to a single machine/account.
pd.read_excel(Path.home() / 'Documents' / 'dummy.xlsx')

In [None]:
# read_csv accepts an http(s) URL directly, so the dataset is fetched straight from GitHub.
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/kc_house_data.csv'
df = pd.read_csv(filepath_or_buffer=url)
# Show the loaded frame.
df

First steps after reading the data:
- `df.head(nrows)` - first n rows
- `df.tail(nrows)` - last n rows
- `df.sample(nrows)` - random n rows
- `df.info()` - data types, missing values
- `df.describe()` - summary statistics
- `df.shape` - number of rows and columns
- `df.columns` - column names

In [None]:
# First two rows of the frame.
df.head(n=2)

In [None]:
# Last five rows (5 is the default for n, spelled out here).
df.tail(n=5)

In [None]:
# Three randomly chosen rows; random_state pins the draw so the same rows
# come back on every run (reproducible under Restart & Run All).
df.sample(n=3, random_state=42)

In [None]:
# Prints column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# count, mean, std, min, quartiles and max for each column
df.describe() # summary statistics for numerical columns (default)

In [None]:
# for object columns the summary is count, unique, top (most frequent value) and freq
df.describe(include='object') # summary statistics for object columns

In [None]:
# (number of rows, number of columns) as a tuple
df.shape

In [None]:
# Print the column labels as a plain Python list (easy to copy and paste).
print(list(df.columns))

In [None]:
# Simple sort: ascending by the 'bedrooms' column; returns a sorted copy, df is unchanged.
df.sort_values(by='bedrooms')

# selecting data in pandas
- dataframe
  - rows
    - iloc - index based location
    - loc - label based location
  - columns
    - single column selection
    - multiple column selection
    - column selection by data type
  - conditional selection
    - boolean indexing
    - query method
  

In [None]:
# display the frame before selecting from it
df

In [None]:
# select rows from iloc
# a single integer position returns that row as a Series
df.iloc[100] # idx 100 row

In [None]:
# 5 hand-picked rows (the positions are fixed, not randomly drawn)
# a list of positions returns a DataFrame
df.iloc[[1,59,100,200,300]] # idx [1, 59, 100, 200, 300] rows

In [None]:
# 3 hand-picked rows, with selected columns
# first list = row positions, second list = column positions (returned in the order given)
df.iloc[[1,10,20], [0,11,12,3,4]]

In [None]:
# iloc slices exclude the end position
df.iloc[15:50, :10] # idx 15-49 rows, 0-9 columns

In [None]:
# selecting columns based on indexes
# ':' keeps every row; the list picks columns by position
df.iloc[:, [0,5,10,11]]

In [None]:
# updating index column: make 'id' the row index so .loc can select rows by id
# The guard keeps this cell re-runnable: once 'id' is the index it is no longer
# a column, and a second set_index('id') would raise KeyError.
if 'id' in df.columns:
    df = df.set_index('id')
df.head()

In [None]:
# loc selects by index label; the labels here are 'id' values because 'id' was set as the index
df.loc[[1954400510, 7202330790]]

### single column selection
- dictionary like selection
  - `dataframe['column_name']`
- object like selection
  - `dataframe.column_name`

### multiple column selection
- dictionary like selection
  - `dataframe[list_of_column_names]`

In [None]:
# dictionary-like selection: returns the column as a Series
df['price']

In [None]:
# object-like selection: works only when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method
df.bedrooms

In [None]:
# a list of names inside the brackets returns a DataFrame with those columns, in the given order
df[['bedrooms','price','sqft_living']]

In [None]:
# the list of column names can be stored in a variable first and then used for selection
col_list = ['grade','condition','yr_built']
df[col_list]

# boolean indexing
`df['column'] > value`

`df['column'] == value`

`df['column'] != value`

In [None]:
# the comparison yields a boolean Series; indexing with it keeps the rows where it is True
df[df['price'] > 1000000]

In [None]:
# 5 star rated houses + 2 floors
# each comparison needs its own parentheses because & binds more tightly than ==
df[(df['condition']==5) & (df['floors'] == 2)]

In [None]:
# name each boolean mask once so the filters can be combined here and reused in later cells
f1 = df['price'] > 1000000
f2 = df['condition'] == 5
f3 = df['floors'] == 2
# keep only rows where all three masks are True
df[f1 & f2 & f3]

In [None]:
# Same three-way filter, rendered as a styled table:
# reset_index turns the index back into a regular column, then cells are shaded with the 'GnBu' colormap.
(
    df[f1 & f2 & f3]
    .reset_index()
    .style
    .background_gradient(cmap='GnBu')
)

In [None]:
# Same filtered rows, with the 'price' column text coloured red via the Styler.
(
    df[f1 & f2 & f3]
    .reset_index()
    .style
    .set_properties(subset=['price'], color='red')
)

In [None]:
# query
# the expression string refers to columns by name; rows where it is True are returned
df.query('price > 2500000')

In [None]:
# 'and' inside the query string combines conditions, like & between boolean masks
df.query('price > 2500000 and condition == 5')

In [None]:
# Filter with query, then order the matching rows by ascending price.
(
    df
    .query('price > 2500000 and condition == 5')
    .sort_values(by='price')
)

### Manipulation and Cleaning data
- removing columns and rows
- renaming columns
- adding columns
- changing data types
- handling missing values
- dropping missing values
- using `apply()` method

In [None]:
# current column labels, to pick the names to drop
df.columns

In [None]:
# Columns removed from the working frame.
cols_to_drop = ['sqft_living15', 'sqft_lot15']
# errors='ignore' skips labels that are already gone, so re-running this cell
# after the columns were dropped does not raise KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')
df

In [None]:
# Drop rows by index label; the result is not assigned, so df itself is unchanged.
df.drop(index=[7129300520, 6414100192, 5631500400])

In [None]:
# subset limits the duplicate check to the listed columns; without it, rows must match on every column
df.drop_duplicates(subset=['date']) # keeps the first row for each distinct 'date'

In [None]:
# returns a new frame; df is not modified because the result is not assigned
df.dropna() # remove rows with any NaN values

In [None]:
# renaming columns
# list the current names first to see what can be renamed
df.columns

In [None]:
# Mapping of old column name -> new column name; columns not listed keep their names.
name_update = dict(lat='latitude', long='longitude')
# rename returns a new frame; the result is not assigned, so df still has 'lat' / 'long'.
df.rename(columns=name_update)

In [None]:
# dummy column add
# assigning a scalar to a new column name creates the column and broadcasts the value
df['dummy'] = "house_price" # repeats same value in all rows
df

In [None]:
# element-wise arithmetic on a column; 2024 is a hard-coded reference year
df['age'] = 2024 - df['yr_built'] # add a new column age
df.head()

In [None]:
# Remove the demo column again.
# errors='ignore' keeps the cell re-runnable: a second run finds no 'dummy' column
# and would otherwise raise KeyError.
df = df.drop(columns=['dummy'], errors='ignore')

In [None]:
# Rearranging columns, step 1: print the current labels as a list so they can be copied.
print(list(df.columns))

In [None]:
# Rearranging columns, step 2: paste the copied list and put the names in the desired order.
# Every name listed here must already exist in df.
columns_seq = [
    'date',
    'bedrooms', 'bathrooms', 'floors',
    'waterfront', 'view',
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated', 'age',
    'condition', 'grade',
    'zipcode', 'lat', 'long',
    'price',
]
# Select the columns in that order and keep an independent copy as the new df.
df = df.loc[:, columns_seq].copy()
df.head()

In [None]:
# Current dtype of the 'floors' column.
df['floors'].dtype

In [None]:
# Preview the conversion to int; the result is not assigned, so df is unchanged.
df['floors'].astype('int')

In [None]:
# Overwrite the column with its integer version (bracket assignment names the column explicitly).
# NOTE(review): astype('int') truncates fractional values (e.g. 1.5 -> 1) — confirm
# 'floors' has no such values, or that the loss is intended, before keeping this step.
df['floors'] = df['floors'].astype('int')
df.head()

In [None]:
import numpy as np

In [None]:
# Small toy frame with deliberate gaps; np.nan and None both end up as missing values.
fake_df = pd.DataFrame(
    data=dict(
        A=[1, 2, 3, 4, 5],
        B=[6, 7, np.nan, 9, 10],
        C=[11, 12, 13, 14, None],
        D=[16, 17, np.nan, 20, np.nan],
    )
)
fake_df

In [None]:
# drops every row that has at least one missing value — here rows 2 and 4, leaving 3 of the 5 rows
fake_df.dropna() # should not be done - rookie mistake

In [None]:
# Number of missing values in each column of the house data.
df.isna().sum()

In [None]:
# Number of missing values in each column of the toy frame.
fake_df.isna().sum()

In [None]:
# the frame still has its gaps: the dropna() result above was not assigned back
fake_df

In [None]:
# Replace every missing value with 0; returns a new frame, fake_df is unchanged.
fake_df.fillna(value=0)

In [None]:
# mean() skips missing values by default, so b_mean is the average of the present entries in B.
b_mean = fake_df['B'].mean()
# Swap each NaN in B for that mean and store the result back in the column.
fake_df['B'] = fake_df['B'].replace(to_replace=np.nan, value=b_mean)
fake_df

In [None]:
# Same imputation for D in one statement: missing entries get the column mean.
fake_df['D'] = fake_df['D'].fillna(value=fake_df['D'].mean())
fake_df