In [2]:
import pandas as pd

# Toy employee table (three rows) that the following cells select from,
# filter, and modify. `data` maps each column label to its list of values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    Salary=[50000, 60000, 70000],
)

# Each dict key becomes a column; rows get the default integer index 0..2.
df = pd.DataFrame(data)


In [3]:
# Single-bracket selection with one column label yields a pandas Series
age_column = df['Age']
print(age_column)

0    25
1    30
2    35
Name: Age, dtype: int64


In [4]:
# Indexing with a *list* of labels keeps the result two-dimensional (a DataFrame)
name_salary_labels = ['Name', 'Salary']
print(df[name_salary_labels])

      Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   70000


In [5]:
# Label-based lookup: .loc[1] fetches the row whose index *label* is 1.
# With the default 0, 1, 2 index that is the second row; it comes back as a Series.
second_row = df.loc[1]
print(second_row)

Name        Bob
Age          30
Salary    60000
Name: 1, dtype: object


In [6]:
# Position-based lookup: .iloc[2] fetches the row at integer position 2
# (the third row), regardless of what the index labels are.
third_row = df.iloc[2]
print(third_row)

Name      Charlie
Age            35
Salary      70000
Name: 2, dtype: object


In [7]:
# Boolean-mask filtering: build a True/False Series, then keep only the
# rows where it is True (Age strictly greater than 25).
older_than_25 = df['Age'] > 25
print(df[older_than_25])

      Name  Age  Salary
1      Bob   30   60000
2  Charlie   35   70000


In [8]:
# Assigning to a new label creates the column: Bonus is 10% of Salary.
# Multiplying the integer Salary column by a float makes Bonus a float column.
df['Bonus'] = 0.1 * df['Salary']
print(df)

      Name  Age  Salary   Bonus
0    Alice   25   50000  5000.0
1      Bob   30   60000  6000.0
2  Charlie   35   70000  7000.0


In [9]:
# Conditional update: set Age to 32 on every row whose Name equals 'Bob'
is_bob = df['Name'] == 'Bob'
df.loc[is_bob, 'Age'] = 32

# Scalar update: .at writes one cell addressed by (row label, column label)
df.at[0, 'Salary'] = 55000

In [10]:
# Remove the Bonus column from df itself: inplace=True mutates df and
# returns None, so re-running this cell raises KeyError (Bonus is already gone).
df.drop('Bonus', axis='columns', inplace=True)
print(df)

      Name  Age  Salary
0    Alice   25   55000
1      Bob   32   60000
2  Charlie   35   70000


In [11]:
# describe(): count, mean, std, min, quartiles and max of the numeric columns
summary_stats = df.describe()

# mean(numeric_only=True): per-column mean, skipping non-numeric columns such as Name
numeric_means = df.mean(numeric_only=True)

print(summary_stats)
print(numeric_means)

             Age        Salary
count   3.000000      3.000000
mean   30.666667  61666.666667
std     5.131601   7637.626158
min    25.000000  55000.000000
25%    28.500000  57500.000000
50%    32.000000  60000.000000
75%    33.500000  65000.000000
max    35.000000  70000.000000
Age          30.666667
Salary    61666.666667
dtype: float64


In [12]:
# Group rows by Age, then sum every remaining column inside each group.
# Gotcha: Name holds strings, so its "sum" is string concatenation. Each Age
# occurs once here, so every name comes back unchanged in the result.
age_groups = df.groupby('Age')
grouped = age_groups.sum()
print(grouped)

        Name  Salary
Age                 
25     Alice   55000
32       Bob   60000
35   Charlie   70000


In [13]:
# Use the Name column as the row index (returns a new frame; df is untouched)
df_indexed = df.set_index('Name')

# Order rows from highest to lowest Salary (also a new frame)
df_sorted = df.sort_values('Salary', ascending=False)

# Sorting keeps the original row labels; drop=True discards them and
# renumbers the sorted rows 0..n-1 instead of keeping them as a column
df_reset = df_sorted.reset_index(drop=True)

In [14]:
# Introducing NaN values
# Salary is an integer column and integers cannot represent NaN. Cast to float
# explicitly before writing the missing value: relying on the silent
# int -> float upcast during assignment is deprecated since pandas 2.1 (PDEP-6)
# and emits a FutureWarning.
df['Salary'] = df['Salary'].astype('float64')
df.loc[1, 'Salary'] = float('nan')

# Checking for missing values (count of NaN per column)
print(df.isna().sum())

# Filling missing values: replace NaN in Salary with the mean of the
# remaining salaries (mean() skips NaN by default)
df.fillna({'Salary': df['Salary'].mean()}, inplace=True)

# Dropping rows with NaN: after the fill above no NaN remains in this
# frame, so this call removes nothing here; it is shown for completeness
df.dropna(inplace=True)


Name      0
Age       0
Salary    1
dtype: int64


In [15]:
# Small lookup table of departments; it has no row for 'Charlie'
df2 = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Department': ['HR', 'IT']})

# Merging (like SQL LEFT JOIN on the Name column): every row of df is kept,
# and rows without a match in df2 get NaN for Department
df_merged = pd.merge(left=df, right=df2, how='left', on='Name')

# Concatenation: stack the rows of df2 under those of df; columns that exist
# in only one of the frames are filled with NaN for the other frame's rows
df_concat = pd.concat([df, df2], axis='index')

# Joining (using index): join() aligns on the index, so Name is moved into the
# index of both frames first. This mutates df and df2 in place, so the cell
# cannot be re-run without rebuilding them (Name is no longer a column).
for frame in (df, df2):
    frame.set_index('Name', inplace=True)
df_joined = df.join(df2, how='left')

In [16]:
# Age bucket per row: 'Young' when Age < 30, otherwise 'Old'.
# The comparison is evaluated for the whole column at once and the resulting
# True/False values are mapped to labels, replacing a per-element lambda.
df['Age_Category'] = df['Age'].lt(30).map({True: 'Young', False: 'Old'})

# Salary after tax: 80% of Salary.
# Plain column arithmetic computes every row in one vectorized operation;
# df.apply(..., axis=1) would instead call a Python function once per row
# and should be reserved for logic that cannot be expressed column-wise.
df['Salary_After_Tax'] = df['Salary'] * 0.8


In [18]:
# Pivot table: mean Salary for each distinct Age value
df_pivot = df.pivot_table(index='Age', values='Salary', aggfunc='mean')

# Name was moved into the index by an earlier cell; reset_index() turns it
# back into a regular column (in a new frame) so it can be referenced by label
df_reset_index = df.reset_index()

# Crosstab for categorical counts: Age values as rows, Name values as
# columns, each entry counting the rows with that (Age, Name) combination
age_by_name_counts = pd.crosstab(index=df_reset_index['Age'], columns=df_reset_index['Name'])
print(age_by_name_counts)

Name  Alice  Bob  Charlie
Age                      
25        1    0        0
32        0    1        0
35        0    0        1
