#### [Vaibhav Rokde](https://www.linkedin.com/in/vaibhavrokde/)

In [2]:
import pandas as pd

# Series and DataFrames

## Series:

  A Series is a one-dimensional labeled array that can hold any data type. It is similar to a column in a spreadsheet or a single variable in statistics.

In [4]:
# A Series built from a plain Python list.
# No index is supplied, so pandas labels the values 0, 1, 2, ...
data = [10, 20, 30, 40, 50]
s = pd.Series(data=data)
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# Same values as before, but each one is labelled with a letter
# instead of the default integer position.
data = [10, 20, 30, 40, 50]
index = list('ABCDE')
s = pd.Series(data=data, index=index)
s

A    10
B    20
C    30
D    40
E    50
dtype: int64

##  DataFrame:

  A DataFrame is a two-dimensional labeled data structure with columns of potentially different data types. It is similar to a spreadsheet or a SQL table, and it is the most commonly used pandas object.

In [10]:
# Each dictionary key becomes a column name and each list supplies
# that column's values, one entry per row.
data = dict(
    Name=['John', 'Jane', 'Mike', 'Lisa'],
    Age=[25, 30, 35, 40],
    City=['New York', 'London', 'Paris', 'Tokyo'],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [12]:
# Creating DataFrame from NumPy arrays
import numpy as np

# 3x3 integer grid holding 1..9; each row of the array becomes a DataFrame row
data = np.arange(1, 10).reshape(3, 3)
# A 2-D array carries no column names, so they are passed explicitly
df = pd.DataFrame(data=data, columns=list('ABC'))
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [15]:
import sqlite3
from contextlib import closing

# Creating DataFrame from SQL query.
# `table` is a reserved word in SQLite, so it must be double-quoted to be
# used as an identifier; unquoted, the statement is rejected as a syntax error.
query = 'SELECT * FROM "table"'

# closing() releases the connection even if the query raises. Using the
# connection itself as a context manager would only manage the transaction;
# it does not close the connection.
with closing(sqlite3.connect('database.db')) as connection:
    df = pd.read_sql_query(query, connection)
print(df)
print('ok')

ok


In [None]:

# Creating DataFrame from CSV file.
# The path is relative, so 'my.csv' is looked up in the current working directory.
df = pd.read_csv(filepath_or_buffer='my.csv')
print(df)


# Slicing, Rows, and Columns

In [3]:
# Re-create the Name/Age/City frame used by the slicing examples below.
# Dictionary keys become column names; each list holds one value per row.
data = dict(
    Name=['John', 'Jane', 'Mike', 'Lisa'],
    Age=[25, 30, 35, 40],
    City=['New York', 'London', 'Paris', 'Tokyo'],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [4]:
# Both slices are printed one after the other, separated by a marker line.
print(
    df.loc[0:2],    # label-based slice: the end label 2 is INCLUDED -> rows 0, 1, 2
    '#*#*' * 5,     # visual separator between the two results
    df.iloc[1:2],   # position-based slice: the end position 2 is EXCLUDED -> row 1 only
    sep='\n',
)


   Name  Age      City
0  John   25  New York
1  Jane   30    London
2  Mike   35     Paris
#*#*#*#*#*#*#*#*#*#*
   Name  Age    City
1  Jane   30  London


In [5]:
# Both selections are printed one after the other, separated by a marker line.
print(
    df['Name'],             # a single label selects one column and yields a Series
    '#*#*' * 5,             # visual separator between the two results
    df[['Name', 'Age']],    # a list of labels selects several columns and yields a DataFrame
    sep='\n',
)


0    John
1    Jane
2    Mike
3    Lisa
Name: Name, dtype: object
#*#*#*#*#*#*#*#*#*#*
   Name  Age
0  John   25
1  Jane   30
2  Mike   35
3  Lisa   40


In [6]:
# .loc[rows, columns]: a label slice for the rows plus ONE column label -> Series.
# The separator line is printed right after it.
print(df.loc[1:2, 'Name'], "#*#*" * 5, sep='\n')

# Same rows, but a LIST of column labels -> DataFrame (displayed as the cell result)
df.loc[1:2, ['Name', 'Age']]


1    Jane
2    Mike
Name: Name, dtype: object
#*#*#*#*#*#*#*#*#*#*


Unnamed: 0,Name,Age
1,Jane,30
2,Mike,35


# Logical Operations

In [8]:
# Boolean mask: True for every row whose Age is at least 35
mask = df['Age'] >= 35
# Indexing with the mask keeps only the rows where it is True
selected_rows = df.loc[mask]
selected_rows

Unnamed: 0,Name,Age,City
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [9]:
# Combined condition: a row is kept only when BOTH tests are True.
# Each comparison is parenthesised because & binds more tightly than == and >.
mask = (df['Name'] == 'Lisa') & (df['Age'] > 35)
selected_rows = df.loc[mask]
selected_rows

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo


# DataFrame Operations

## Arithmetic operations

In [29]:
# The arithmetic and aggregation examples use a small all-numeric frame.
# It is built here so the cell does not rely on whatever `df` happens to hold:
# when the notebook is run top to bottom, `df` is the Name/Age/City frame at
# this point, which has no 'A' or 'B' column.
# NOTE: `df` is rebound to the numeric frame from here on.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['A', 'B', 'C'])

df['D'] = df['A'] + df['B']  # Add two columns element-wise and store the result in a new column
df

Unnamed: 0,A,B,C,D
0,1,2,3,3
1,4,5,6,9
2,7,8,9,15


## Aggregation

In [31]:
# Arithmetic mean of the values in column 'A'
df['A'].mean()

4.0

In [32]:
# Total of all values in column 'A'
df['A'].sum()

12

In [33]:
# Number of non-missing entries in column 'A' (NaN values are not counted)
df['A'].count()  

3

In [36]:
# How many times each distinct value occurs in column 'A'
df['A'].value_counts()

1    1
4    1
7    1
Name: A, dtype: int64

In [37]:
# Smallest value in column 'A'
df['A'].min()

1

In [38]:
# Largest value in column 'A'
df['A'].max()

7

In [39]:
# Median (middle value) of column 'A'
df['A'].median()

4.0

In [40]:
# Sample standard deviation of column 'A' (pandas divides by N-1 by default, i.e. ddof=1)
df['A'].std()

3.0

Other useful aggregation functions include *var()* for variance, *quantile()* for calculating percentiles, and *describe()* for generating summary statistics of a DataFrame. pandas also provides the flexibility to apply custom aggregation functions using the *agg()* or *apply()* methods.

## Filtering

### Filtering Rows

In [10]:
# The filtering and sorting examples use the Name/Age/City frame. It is
# re-created here so this section runs on its own, regardless of what `df`
# was bound to by earlier cells.
df = pd.DataFrame({'Name': ['John', 'Jane', 'Mike', 'Lisa'],
                   'Age': [25, 30, 35, 40],
                   'City': ['New York', 'London', 'Paris', 'Tokyo']})

# Boolean Series: True where Age is greater than 10 (every row in this data,
# so nothing is filtered out here)
condition = df['Age'] > 10
filtered_df = df[condition]
filtered_df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [11]:
# Two independent boolean Series, one per test
condition1 = df['Age'] > 10
condition2 = df['Name'] == 'Lisa'
# & is the element-wise AND: a row survives only when both Series are True for it
filtered_df = df.loc[condition1 & condition2]
filtered_df

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo


In [12]:
# Candidate names to keep; entries that never occur in the column simply match nothing
values = ['Lisa', 'B', 'C']
# isin() is True for each row whose Name appears in the list
condition = df['Name'].isin(values)
filtered_df = df.loc[condition]
filtered_df

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo


### Filtering Columns

In [13]:
# Keep all rows, pick several columns; the result follows the order of the list
selected_columns = df.loc[:, ['Age', 'Name']]
selected_columns


Unnamed: 0,Age,Name
0,25,John
1,30,Jane
2,35,Mike
3,40,Lisa


In [14]:
# Keep every column whose label matches the regular expression 'Nam'
selected_columns = df.filter(regex='Nam', axis='columns')
selected_columns

Unnamed: 0,Name
0,John
1,Jane
2,Mike
3,Lisa


### Filtering with query()

In [16]:
# query() takes the filter as a string; column names are written bare inside it
filtered_df = df.query(expr='Name == "Lisa" and Age >= 35')
filtered_df

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo


## Sorting

In [17]:
# Order the rows by Age, smallest first; a new frame is returned and df is left as it was
sorted_df = df.sort_values(by='Age', ascending=True)
sorted_df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [18]:
# Order by Name first; Age is only used to break ties between equal names
sorted_df = df.sort_values(by=['Name', 'Age'])
sorted_df

Unnamed: 0,Name,Age,City
1,Jane,30,London
0,John,25,New York
3,Lisa,40,Tokyo
2,Mike,35,Paris


In [19]:
# Order the rows by Age, largest first
sorted_df = df.sort_values(by='Age', ascending=False)
sorted_df

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo
2,Mike,35,Paris
1,Jane,30,London
0,John,25,New York


In [20]:
# Order the rows by their index labels, smallest first
sorted_df = df.sort_index(ascending=True)
sorted_df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,London
2,Mike,35,Paris
3,Lisa,40,Tokyo


In [22]:
# Order the rows by their index labels, largest first
sorted_df = df.sort_index(axis=0, ascending=False)
sorted_df

Unnamed: 0,Name,Age,City
3,Lisa,40,Tokyo
2,Mike,35,Paris
1,Jane,30,London
0,John,25,New York
