# Pandas and NumPy Examples

In [1]:
# Importing libraries
import pandas as pd
import numpy as np

In [2]:
# Build a small DataFrame: each dictionary key becomes a column,
# and each list supplies that column's values row by row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 22],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,22


In [3]:
# 'Bob' is a value inside the Name column, not a column label, so a
# column-style lookup on the DataFrame raises KeyError (see the output).
df['Bob']

KeyError: 'Bob'

In [4]:
# Two Series holding the same ages under partly different labels:
# 'Bethaney' exists only in df1 and 'Bob' only in df2.
Name = ['Alice', 'Bethaney', 'Charlie']
Age = [25, 30, 22]
df1 = pd.Series(Age, index=Name)

Name = ['Alice', 'Bob', 'Charlie']
Age = [25, 30, 22]
df2 = pd.Series(Age, index=Name)

# Addition aligns on index labels; a label missing from either side gives NaN.
df1 + df2

Alice       50.0
Bethaney     NaN
Bob          NaN
Charlie     44.0
dtype: float64

In [5]:
# On a Series, [] looks up by index label, so 'Bob' resolves to his age.
df2['Bob']

30

## Basic DataFrame Operations

In [6]:
# Displaying the first few rows (this frame has only three rows, so all of them appear)
df.head()

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,22


In [7]:
# Accessing a specific column; a single column label returns a Series
df['Age']

0    25
1    30
2    22
Name: Age, dtype: int64

In [8]:
# Adding a new column: the list supplies one value per existing row, in row order.
# The assignment modifies df itself.
df['Gender'] = ['Female', 'Male', 'Male']
df

Unnamed: 0,Name,Age,Gender
0,Alice,25,Female
1,Bob,30,Male
2,Charlie,22,Male


In [12]:
# Filtering rows based on a condition: the boolean mask keeps rows with Age > 25.
# NOTE(review): the saved output includes a 'New' column that is only created
# in a later cell, so this cell appears to have been executed out of order.
df[df['Age'] > 25]

Unnamed: 0,Name,Age,Gender,New
1,Bob,30,Male,40


In [23]:
# Derive a new column from an existing one: element-wise Age + 10
df['New'] = df['Age'] + 10
df

Unnamed: 0,Name,Age,Gender,New
0,Alice,25,Female,35
1,Bob,30,Male,40
2,Charlie,22,Male,32


In [16]:
# axis=0 looks for 'New' among the row labels; no row has that label,
# which is why this call raises the KeyError shown in the output.
df.drop('New',axis = 0) # axis = 0 drops rows by index label

KeyError: "['New'] not found in axis"

In [18]:
# axis=1 looks for 'New' among the column labels. The result is a new frame;
# df is not modified because the result is neither assigned nor dropped in place.
df.drop('New',axis = 1) # axis = 1 drops columns by label

Unnamed: 0,Name,Age,Gender
0,Alice,25,Female
1,Bob,30,Male
2,Charlie,22,Male


In [19]:
# df still contains 'New': the previous drop() returned a copy
df

Unnamed: 0,Name,Age,Gender,New
0,Alice,25,Female,35
1,Bob,30,Male,40
2,Charlie,22,Male,32


In [24]:
# Permanently remove the 'New' column by rebinding df to the result of drop()
# instead of using inplace=True. errors='ignore' keeps the cell re-runnable:
# a second run is a no-op rather than a KeyError once the column is gone.
df = df.drop(columns='New', errors='ignore')

In [21]:
# Confirm that the 'New' column is no longer part of df
df

Unnamed: 0,Name,Age,Gender
0,Alice,25,Female
1,Bob,30,Male
2,Charlie,22,Male


In [32]:
# Selecting several columns: passing a list of labels returns a DataFrame
df[['Name','Age']]

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,22


In [29]:
# .loc selects by index label: here, the row whose label is 1
df.loc[1] # label-based lookup

Name       Bob
Age         30
Gender    Male
Name: 1, dtype: object

In [30]:
# .iloc selects by integer position: here, the second row (position 1).
# With this frame's default 0..n-1 index it matches df.loc[1].
df.iloc[1] # position-based lookup

Name       Bob
Age         30
Gender    Male
Name: 1, dtype: object

## Indexing and Slicing

In [33]:
# Accessing a specific cell by row label (0) and column label ('Name')
df.at[0, 'Name']

'Alice'

In [36]:
# Slicing rows and columns
# iloc slices by position (stop excluded); loc slices by label (both ends included).
row_slice = df.iloc[1:3]
col_slice = df.loc[:, 'Name':'Age']
# One print call; the separator reproduces the two blank lines between the frames.
print(row_slice, col_slice, sep='\n\n\n')

      Name  Age Gender
1      Bob   30   Male
2  Charlie   22   Male


      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   22


In [None]:
# Boolean indexing: keep only the rows whose Gender equals 'Male'.
# The result is stored in a variable, so the cell itself displays nothing.
boolean_slice = df[df['Gender'] == 'Male']

## Grouping and Aggregation

In [37]:
# Grouping by a column and calculating mean: average Age within each Gender
df.groupby('Gender')['Age'].mean()

Gender
Female    25.0
Male      26.0
Name: Age, dtype: float64

In [38]:
# Aggregating multiple functions: one output column per function name.
# std comes out as NaN for a group that contains a single row.
df.groupby('Gender')['Age'].agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,mean,std,count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,25.0,,1
Male,26.0,5.656854,2


## Data Cleaning

In [None]:
# Handling missing values
# Neither result is assigned, so df is not modified here, and only the last
# expression in the cell (fillna) would be displayed.
df.dropna()  # Drop rows with missing values
df.fillna(0)  # Fill missing values with a specific value

In [None]:
# Removing duplicate rows; the result is displayed but not stored
df.drop_duplicates()

## Merging and Joining DataFrames

In [None]:
# Concatenating vertically (axis=0)
# Note: df1 and df2 as created earlier in this notebook are Series, not
# DataFrames; pd.concat accepts either.
pd.concat([df1, df2], axis=0)

In [None]:
# Merging DataFrames based on a common column
# The original cell referenced left_df/right_df without defining them, so it
# failed with NameError on a fresh kernel. Two small example frames are
# defined here so the cell runs on its own.
left_df = pd.DataFrame({'common_column': [1, 2, 3],
                        'left_value': ['a', 'b', 'c']})
right_df = pd.DataFrame({'common_column': [2, 3, 4],
                         'right_value': ['x', 'y', 'z']})
# The default how='inner' keeps only keys present in both frames (2 and 3).
pd.merge(left_df, right_df, on='common_column')

## Pivot Tables and Cross-Tabulations

In [None]:
# Creating a pivot table: sums 'Value' for every Category (rows) x Date (columns) pair
# NOTE(review): the df built earlier in this notebook only has Name/Age/Gender
# columns, so this is a template; it needs a frame with Value, Category and Date.
df.pivot_table(values='Value', index='Category', columns='Date', aggfunc='sum')

In [None]:
# Creating a cross-tabulation: counts of each Gender / Category combination
# NOTE(review): df has no 'Category' column in this notebook; confirm the
# intended data before running.
pd.crosstab(df['Gender'], df['Category'])

## Time Series Operations

In [None]:
# Converting column to datetime type (overwrites the 'Date' column in df)
# NOTE(review): df has no 'Date' column in this notebook; confirm the
# intended data before running.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Resampling time series data: daily buckets keyed on the 'Date' column, summed
# NOTE(review): requires a datetime 'Date' column, which df does not have in
# this notebook; confirm how non-numeric columns should be handled by sum().
resampled_df = df.resample('D', on='Date').sum()

## Plotting

In [39]:
# Line plot of Value against Date
# NOTE(review): df has neither 'Date' nor 'Value' here, which is why the saved
# output is KeyError: 'Date'.
df.plot(x='Date', y='Value', kind='line')

KeyError: 'Date'

In [None]:
# Bar plot of Value for each Category
# NOTE(review): df has neither 'Category' nor 'Value' in this notebook;
# confirm the intended data before running.
df.plot(x='Category', y='Value', kind='bar')

## Reading Excel Files

In [None]:
# Reading an Excel file: loads the sheet named 'Sheet1' from data.xlsx
# NOTE(review): data.xlsx is not created anywhere in this notebook; confirm
# the file exists in the working directory.
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')

## Applying Functions

In [None]:
# Applying a function to a column
def double_age(age):
    """Return the given age multiplied by two."""
    doubled = age * 2
    return doubled
# apply() calls double_age once for every value in the Age column
df['Double_Age'] = df['Age'].apply(double_age)

In [None]:
# Applying a lambda function: ages below 30 are labelled 'Young';
# 30 and above are labelled 'Old' (the comparison is strict)
df['Age_Category'] = df['Age'].apply(lambda x: 'Young' if x < 30 else 'Old')

## Grouping and Pivot Visualization

In [None]:
# Visualizing grouped data
# `grouped` was never assigned in the notebook, so the original cell failed
# with NameError. Build it here as the mean Age per Gender; as_index=False
# keeps Gender as a regular column so it can be passed as x= below.
grouped = df.groupby('Gender', as_index=False)['Age'].mean()
grouped.plot(kind='bar', x='Gender', y='Age', title='Average Age by Gender')

In [None]:
# Visualizing pivot table
# NOTE(review): `pivot_table` is not assigned anywhere in the visible notebook,
# and 'heatmap' is not among the plot kinds listed in the pandas documentation;
# this cell is expected to fail as written. Confirm the intended plotting approach.
pivot_table.plot(kind='heatmap', cmap='coolwarm', title='Pivot Table Heatmap')

# Pandas Quick Reference

Pandas provides a large set of functions and methods for advanced data analysis. Here's a structured breakdown:

### **1. Data Loading & Saving**
- `pd.read_csv('file.csv')` – Load CSV file.
- `pd.read_excel('file.xlsx')` – Load Excel file.
- `df.to_csv('file.csv', index=False)` – Save DataFrame as CSV.
- `df.to_excel('file.xlsx', index=False)` – Save DataFrame as Excel.

### **2. Data Exploration**
- `df.head(n)` – View first `n` rows.
- `df.tail(n)` – View last `n` rows.
- `df.shape` – Get number of rows and columns.
- `df.info()` – Summary of the DataFrame.
- `df.describe()` – Summary statistics.

### **3. Data Selection & Filtering**
- `df['column_name']` – Select a column.
- `df.loc[row_index, 'column_name']` – Select specific row-column.
- `df.iloc[row_index, column_index]` – Select by position.
- `df[df['column_name'] > value]` – Filter rows with condition.
- `df.query("column_name > value")` – Query data.
  
### **4. Data Cleaning**
- `df.drop(columns=['col1', 'col2'])` – Drop columns.
- `df.dropna()` – Remove missing values.
- `df.fillna(value)` – Fill missing values.
- `df.replace({'old_value': 'new_value'})` – Replace values.
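- `df.astype('datatype')` – Cast the whole DataFrame to a dtype (use `df['col'].astype('datatype')` or `df.astype({'col': 'datatype'})` for a single column).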
  
### **5. Aggregation & Grouping**
- `df.groupby('column_name').mean(numeric_only=True)` – Group rows and average the numeric columns of each group.
- `df['column_name'].value_counts()` – Count unique values.
- `df.pivot_table(values='val', index='idx', columns='cols', aggfunc='sum')` – Pivot table.

### **6. Merge & Join**
- `pd.concat([df1, df2])` – Concatenate DataFrames.
- `df1.merge(df2, on='common_column', how='inner')` – Merge DataFrames.
  
### **7. Apply Functions**
- `df.apply(lambda x: x*2)` – Apply a function to each column (pass `axis=1` to apply it to each row).
- `df['new_col'] = df['col'].map(lambda x: x/100)` – Map function.

### **8. Time Series & Date Handling**
- `pd.to_datetime(df['date_column'])` – Convert to datetime.
- `df['date_column'].dt.year` – Extract year.
- `df.resample('M').sum()` – Resample data by month (requires a datetime index, or pass `on='date_column'`; pandas 2.2+ prefers the alias `'ME'`).

### **9. Advanced Operations**
- `df.sort_values(by='column_name', ascending=False)` – Sort data.
- `df.duplicated()` – Find duplicate rows.
- `df.corr(numeric_only=True)` – Compute pairwise correlation between the numeric columns.

### **10. Export & Convert**
- `df.to_json('file.json')` – Export to JSON.
- `df.to_dict()` – Convert to dictionary.

Pandas is an extensive library with a vast number of functions for data analysis. The list below is not exhaustive, but it expands on the quick reference above.
### **1. Data Loading & Storage**

- `pd.read_csv('file.csv')` – Load CSV file.

- `pd.read_excel('file.xlsx')` – Load Excel file.

- `pd.read_json('file.json')` – Load JSON file.

- `pd.read_sql('query', connection)` – Load data from SQL database.

- `df.to_csv('file.csv', index=False)` – Save as CSV.

- `df.to_excel('file.xlsx', index=False)` – Save as Excel.

- `df.to_json('file.json')` – Save as JSON.

### **2. Data Exploration & Structure**

- `df.head(n)` – View first `n` rows.

- `df.tail(n)` – View last `n` rows.

- `df.shape` – Get number of rows and columns.

- `df.info()` – Display DataFrame summary.

- `df.describe()` – Summary statistics.

- `df.columns` – List column names.

- `df.dtypes` – Check column data types.

- `df.memory_usage()` – Memory usage per column.

### **Selection and Filtering in Pandas**  

Selecting and filtering data in Pandas is a crucial part of data analysis. It allows you to extract specific rows, columns, or subsets of data based on conditions.

---

### **1. Selecting Data**  
Pandas provides several ways to select data from a DataFrame:  

#### **Selecting Columns**  
- `df['column_name']` → Selects a single column.
- `df[['column1', 'column2']]` → Selects multiple columns.

#### **Selecting Rows**  
- `df.iloc[row_index]` → Selects rows based on index position.
- `df.loc[row_label]` → Selects rows based on labels.

#### **Selecting Specific Rows & Columns**  
- `df.iloc[row_index, column_index]` → Selects data by position.
- `df.loc[row_label, 'column_name']` → Selects data by label.

---

### **2. Filtering Data**  
Filtering is used to extract specific rows that meet certain conditions.  

#### **Basic Filtering**
- `df[df['column_name'] > value]` → Filters rows where values in a column are greater than `value`.
- `df[df['column_name'] == 'specific_value']` → Filters rows where values match `specific_value`.

#### **Multiple Conditions**
- `df[(df['col1'] > 10) & (df['col2'] < 50)]` → Filters rows where `col1` is greater than `10` AND `col2` is less than `50`.
- `df[(df['col1'] > 10) | (df['col2'] < 50)]` → Filters rows where `col1` is greater than `10` OR `col2` is less than `50`.

#### **Using `.query()`**
- `df.query("column_name > 50 and other_column < 10")` → A more readable way to filter data using conditions.

#### **Filtering with `isin()`**
- `df[df['column_name'].isin(['A', 'B', 'C'])]` → Filters rows where values in the column are "A", "B", or "C".

#### **Filtering Missing Values**
- `df[df['column_name'].isnull()]` → Filters rows where values are `NaN`.
- `df[df['column_name'].notnull()]` → Filters rows where values are NOT `NaN`.

---

### **Example**
```python
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Salary': [50000, 60000, 70000, 80000]}
df = pd.DataFrame(data)

# Select Age column
print(df['Age'])

# Filter rows where Age > 30
print(df[df['Age'] > 30])

# Filter rows with specific names
print(df[df['Name'].isin(['Alice', 'David'])])
```

This will output:
```
0    25
1    30
2    35
3    40
Name: Age, dtype: int64

      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000

    Name  Age  Salary
0  Alice   25   50000
3  David   40   80000
```

Selection and filtering help you narrow down your dataset and extract relevant insights efficiently.