# In [None]:
# =====================================================================================
# 📘 DAY 4 – STRING OPERATIONS IN PANDAS
# Author: Tanuja Mannem
# Description:
#     Learn to clean and manipulate textual data using Pandas string functions.
#     Covers vectorized string methods, regex extraction, replacements, and text cleaning.
#     Includes theory, examples, practice tasks with answers, and interview questions.
# =====================================================================================

import pandas as pd

# =====================================================================================
# 1. OVERVIEW / THEORY
# =====================================================================================
# Pandas provides vectorized string operations via the `.str` accessor.
# These allow applying string functions efficiently to entire Series objects
# without using Python loops.
#
# Key Advantages:
#   - Fast and efficient (vectorized)
#   - Supports regex for complex pattern matching
#   - Propagates missing (NaN) values instead of raising; boolean methods
#     such as contains() accept na=False so the result can be used as a mask
#
# Common Operations:
#   - Cleaning: lower(), upper(), strip(), replace()
#   - Searching: contains(), startswith(), endswith()
#   - Splitting and Extracting: split(), extract(), get_dummies()
#   - Combining: cat()
# =====================================================================================


# =====================================================================================
# 2. SAMPLE DATAFRAME
# =====================================================================================
# Column-oriented sample records: one list per column, five rows each.
# Several City values include leading and/or trailing spaces.
data = dict(
    Name=['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'David Lee', 'Esha Kapoor'],
    Email=[
        'alice.johnson@gmail.com',
        'bob.smith@yahoo.com',
        'charlie.brown@outlook.com',
        'davidlee@gmail.com',
        'esha.kapoor@company.in',
    ],
    Department=['HR', 'IT', 'Finance', 'IT', 'HR'],
    City=['New York ', ' London', 'Paris', 'Berlin ', ' Mumbai '],
)

# Build the working frame and show it under a heading.
df = pd.DataFrame(data=data)
print("\n--- Original DataFrame ---", df, sep="\n")


# =====================================================================================
# 3. BASIC STRING METHODS
# =====================================================================================
# Theory:
#   - .str.upper() / .str.lower() → Convert case
#   - .str.strip() → Remove leading/trailing whitespace
#   - .str.len() → String length
# =====================================================================================

print("\n--- Basic String Operations ---")

# Compute each derived Series first, then attach them to the frame in the
# same order as before so the column layout is unchanged.
upper_names = df['Name'].str.upper()        # upper-cased copy of Name
stripped_cities = df['City'].str.strip()    # City without surrounding whitespace
email_lengths = df['Email'].str.len()       # character count of each Email

df['Name_Upper'] = upper_names
df['City_Cleaned'] = stripped_cities
df['Email_Length'] = email_lengths

# Show the source columns next to the columns derived from them.
basic_view = ['Name', 'Name_Upper', 'City', 'City_Cleaned', 'Email_Length']
print(df.loc[:, basic_view])


# =====================================================================================
# 4. STRING SEARCHING & FILTERING
# =====================================================================================
# Theory:
#   - .str.contains('pattern') → Returns boolean mask
#   - .str.startswith('prefix') / .str.endswith('suffix')
# =====================================================================================

print("\n--- Searching Examples ---")

# regex=False treats 'gmail' as a literal substring rather than a pattern.
# na=False maps missing emails to False, so the mask stays purely boolean
# and can be used for row filtering even when the column contains NaN.
print("Emails containing 'gmail':")
gmail_mask = df['Email'].str.contains('gmail', regex=False, na=False)
print(df[gmail_mask])

# Same NaN handling for the prefix test on Name.
print("\nNames starting with 'A':")
starts_with_a = df['Name'].str.startswith('A', na=False)
print(df[starts_with_a])


# =====================================================================================
# 5. SPLITTING & EXTRACTING STRINGS
# =====================================================================================
# Theory:
#   - .str.split(delimiter, expand=True) → Split into columns
#   - .str.extract(regex) → Extract substring using regex pattern
# =====================================================================================

print("\n--- Splitting and Extracting ---")

# n=1 splits on the first space only, so the result has at most two columns.
# Without it, a name with more than two words would yield extra columns and
# the two-column assignment would fail; now the remainder goes to LastName.
df[['FirstName', 'LastName']] = df['Name'].str.split(' ', n=1, expand=True)

# Capture the text that follows '@'. With a single capture group,
# expand=False returns a Series, which is what a one-column assignment needs.
df['Domain'] = df['Email'].str.extract(r'@([\w.-]+)', expand=False)
print(df[['Name', 'FirstName', 'LastName', 'Domain']])


# =====================================================================================
# 6. REPLACING AND CLEANING TEXT
# =====================================================================================
# Theory:
#   - .str.replace(old, new, regex=True/False)
#   - Useful for cleaning unwanted characters or symbols.
# =====================================================================================

print("\n--- Replacing and Cleaning ---")

# The pattern is anchored with `$` so only a trailing ".com" / ".in" suffix
# is removed. An unanchored alternation would also delete matches in the
# middle of a domain (e.g. the ".in" of "example.info").
df['Email_Provider_Clean'] = df['Domain'].str.replace(r'\.(?:com|in)$', '', regex=True)
print(df[['Email', 'Email_Provider_Clean']])


# =====================================================================================
# 7. REGEX OPERATIONS
# =====================================================================================
# Theory:
#   - Regular expressions allow advanced text extraction/replacement.
#   - Example patterns:
#       \d+ → digits
#       [A-Za-z]+ → letters
#       @([\w.-]+) → domain in an email
# =====================================================================================

print("\n--- Regex Examples ---")

# The capture group takes the run of word characters, dots and hyphens
# located between the start of the string and the '@'.
prefix_pattern = r'^([\w.-]+)@'
df['Email_Prefix'] = df['Email'].str.extract(prefix_pattern)

# Show each address beside the part captured from it.
print(df.loc[:, ['Email', 'Email_Prefix']])


# =====================================================================================
# 8. COMBINING STRING COLUMNS
# =====================================================================================
# Theory:
#   - Use .str.cat() to concatenate strings across columns.
#   - You can specify separators with sep parameter.
# =====================================================================================

print("\n--- Combining String Columns ---")

# Element-wise join of FirstName and Department with " - " between them.
department = df['Department']
df['Full_Info'] = df['FirstName'].str.cat(others=department, sep=" - ")
print(df.loc[:, ['Full_Info']])


# =====================================================================================
# 9. STRING METHODS SUMMARY TABLE
# =====================================================================================
# Commonly used string methods:
#   .str.upper(), .str.lower(), .str.title()
#   .str.strip(), .str.replace()
#   .str.contains(), .str.startswith(), .str.endswith()
#   .str.extract(), .str.split(), .str.cat()
# =====================================================================================


# =====================================================================================
# 10. PRACTICE TASKS (with Answers)
# =====================================================================================
print("\n================= PRACTICE TASKS & ANSWERS =================")

# Task 1: Extract only usernames from 'Email'
print("\nTask 1 — Extract usernames:")
# The pattern requires the '@' after the captured part (the same pattern
# used for Email_Prefix), so a value that is not an email address yields NaN
# instead of being reported as a username. expand=False returns a Series.
df['Username'] = df['Email'].str.extract(r'^([\w.-]+)@', expand=False)
print(df[['Email', 'Username']])

# Task 2: Find employees whose email contains 'yahoo'
print("\nTask 2 — Emails containing 'yahoo':")
# Literal substring match (regex=False); na=False keeps the mask boolean
# if an email is missing, so it can be used directly as a row filter.
ans2 = df[df['Email'].str.contains('yahoo', regex=False, na=False)]
print(ans2[['Name', 'Email']])

# Task 3: Clean city names (remove surrounding spaces and convert to uppercase)
print("\nTask 3 — Clean City names:")
# strip() trims leading/trailing whitespace, then upper() changes the case.
# This overwrites the City_Cleaned column with the upper-cased values.
df['City_Cleaned'] = df['City'].str.strip().str.upper()
print(df[['City', 'City_Cleaned']])

# Task 4: Split 'Name' into 'First' and 'Last'
print("\nTask 4 — Split Names:")
# n=1 splits on the first space only, giving at most two columns; names with
# more than two words keep their remainder in 'Last' instead of producing
# extra columns that would break the two-column assignment.
df[['First', 'Last']] = df['Name'].str.split(' ', n=1, expand=True)
print(df[['First', 'Last']])

# Task 5: Extract domain provider from email (without .com/.in)
print("\nTask 5 — Extract domain:")
# Step 1: capture everything after '@' (expand=False returns a Series).
df['Provider'] = df['Email'].str.extract(r'@([\w.-]+)', expand=False)
# Step 2: drop a trailing ".com" / ".in". The `$` anchor restricts the match
# to the end of the domain so text inside it (e.g. ".info") is left intact.
df['Provider'] = df['Provider'].str.replace(r'\.(?:com|in)$', '', regex=True)
print(df[['Email', 'Provider']])

# Task 6: Find all names ending with 'n'
print("\nTask 6 — Names ending with 'n':")
# na=False maps missing names to False so the mask is safe for filtering.
ans6 = df[df['Name'].str.endswith('n', na=False)]
print(ans6[['Name']])

# Task 7: Replace “gmail” with “GMAIL” in Email column
print("\nTask 7 — Replace gmail → GMAIL:")
# Plain literal replacement. regex=False is spelled out because the default
# of Series.str.replace differs between pandas versions.
# Note: this overwrites the Email column in place.
df['Email'] = df['Email'].str.replace('gmail', 'GMAIL', regex=False)
print(df[['Email']])

# Task 8: Add a new column combining Name and Department
print("\nTask 8 — Combine Name and Department:")
# Element-wise concatenation of the two columns with " | " between them.
df['Name_Dept'] = df['Name'].str.cat(df['Department'], sep=" | ")
print(df[['Name_Dept']])

# Task 9: Count number of characters in each Name
print("\nTask 9 — Name length count:")
# .str.len() counts every character of the string, including the space
# between the first and last name.
df['Name_Length'] = df['Name'].str.len()
print(df[['Name', 'Name_Length']])

# Task 10: Extract email domains ending with “.com”
print("\nTask 10 — Domains ending with .com:")
# endswith() does a literal suffix test (no regex); na=False keeps the mask
# boolean if an email is missing. Rows whose Email ends in ".com" are kept.
ans10 = df[df['Email'].str.endswith('.com', na=False)]
print(ans10[['Name', 'Email']])


# =====================================================================================
# 11. INTERVIEW QUESTIONS (with detailed answers)
# =====================================================================================

# 1️⃣ What is the use of the .str accessor in Pandas?
#     → It allows applying vectorized string operations on entire Series efficiently.

# 2️⃣ Difference between str.split() and str.extract()?
#     → str.split() splits based on a delimiter; str.extract() uses regex to capture specific patterns.

# 3️⃣ How to remove leading/trailing spaces?
#     → Use .str.strip()

# 4️⃣ How to check if a column value contains a substring (case-insensitive)?
#     → df['col'].str.contains('text', case=False, na=False)

# 5️⃣ How to replace multiple substrings at once?
#     → df['col'].replace({'old1':'new1','old2':'new2'}, regex=True)

# 6️⃣ How to extract only the domain name from an email?
#     → df['Email'].str.extract(r'@([\w.-]+)')

# 7️⃣ How to count substring occurrences in each cell?
#     → df['col'].str.count('substring')

# 8️⃣ How to concatenate multiple string columns with separator?
#     → df['new'] = df['col1'].str.cat(df['col2'], sep=' - ')

# 9️⃣ How to handle NaN values while doing string operations?
#     → Use df['col'].fillna('') or set na=False in .str.contains()

# 🔟 How to extract numbers from text like “Order123”?
#     → df['col'].str.extract(r'(\d+)')

# 1️⃣1️⃣ How to find all rows where emails end with “.in”?
#     → df[df['Email'].str.endswith('.in')]

# 1️⃣2️⃣ How to perform regex-based replacements?
#     → df['col'].str.replace(r'\d+', '', regex=True) → removes all numbers.
