# Code-Based Questions Assignment
## Section 2.1.1: Data Format Converter

### Q1. Build a Python program that converts data between CSV, JSON, Excel, and Text formats.

In [100]:
import pandas as pd
import os

def convert_file(input_path, target_format):
    """Convert a tabular data file to another format via a pandas DataFrame.

    The input format is inferred from the extension of ``input_path``
    (``.csv``, ``.json``, ``.xlsx``/``.xls``, or tab-separated ``.txt``).
    The result is written to ``output.<target_format>`` in the current
    working directory.

    Args:
        input_path: Path of the file to read.
        target_format: One of ``'csv'``, ``'json'``, ``'xlsx'`` or ``'txt'``.

    Returns:
        The message ``"Converted to <output_path>"``.

    Raises:
        ValueError: If the input extension or ``target_format`` is not
            supported.
    """
    readers = {
        '.csv': pd.read_csv,
        '.json': pd.read_json,
        '.xlsx': pd.read_excel,
        '.xls': pd.read_excel,
        '.txt': lambda path: pd.read_csv(path, sep='\t'),
    }
    writers = {
        'csv': lambda df, path: df.to_csv(path, index=False),
        'json': lambda df, path: df.to_json(path, orient='records', indent=4),
        'xlsx': lambda df, path: df.to_excel(path, index=False),
        'txt': lambda df, path: df.to_csv(path, sep='\t', index=False),
    }

    # Validate both ends up front: an unknown target used to report success
    # without writing anything, and an unknown extension used to fail later
    # with an UnboundLocalError on ``df``.
    if target_format not in writers:
        raise ValueError(
            f"Unsupported target format {target_format!r}; "
            f"expected one of {sorted(writers)}"
        )
    ext = os.path.splitext(input_path)[1].lower()
    if ext not in readers:
        raise ValueError(
            f"Unsupported input extension {ext!r}; "
            f"expected one of {sorted(readers)}"
        )

    df = readers[ext](input_path)
    output_path = f"output.{target_format}"
    writers[target_format](df, output_path)
    return f"Converted to {output_path}"

print("Converter function defined.")

Converter function defined.


### Q2. How will your program handle nested JSON structures during conversion?

In [101]:
import pandas as pd
import json

def handle_nested(json_data):
    """Parse a JSON string and flatten it into a DataFrame.

    Nested objects are expanded by ``pandas.json_normalize`` into
    dot-separated columns (for example ``info.name``).
    """
    parsed = json.loads(json_data)
    flattened = pd.json_normalize(parsed)
    return flattened

sample = '{"id": 1, "info": {"name": "Alice", "loc": "NY"}}'
print(handle_nested(sample))

   id info.name info.loc
0   1     Alice       NY


### Q3. How do you validate data types and detect missing values during conversion?

In [102]:
def validate(df):
    """Print the per-column missing-value counts, then the column dtypes."""
    # Null count for every column.
    missing_per_column = df.isnull().sum()
    print("Missing Values:\n", missing_per_column)

    # dtype pandas assigned to every column.
    column_types = df.dtypes
    print("\nData Types:\n", column_types)

validate(pd.DataFrame({'A': [1, None], 'B': ['x', 'y']}))

Missing Values:
 A    1
B    0
dtype: int64

Data Types:
 A    float64
B        str
dtype: object


### Q4. Design a command-line interface (CLI) for selecting input and output formats.

In [103]:
import argparse

def cli():
    """Build the command-line parser for the data format converter.

    The parser is returned rather than run here so the caller decides which
    argument list to parse (``parser.parse_args(argv)``).

    Returns:
        argparse.ArgumentParser: Parser with a required ``-i/--input`` path
        and a required ``-f/--format`` output format restricted to
        ``csv``, ``json``, ``xlsx`` or ``txt``.
    """
    parser = argparse.ArgumentParser(
        description="Convert data between CSV, JSON, Excel and text formats."
    )
    parser.add_argument("-i", "--input", required=True, help="Input path")
    parser.add_argument(
        "-f",
        "--format",
        required=True,
        choices=['csv', 'json', 'xlsx', 'txt'],
        help="Output format",
    )
    print("CLI Parser ready.")
    # Previously the parser was discarded, so it could never be used.
    return parser

cli()

CLI Parser ready.


### Q5. Generate a data quality report showing missing values, data types, and inconsistencies.

In [104]:
def quality_report(df):
    """Summarise every column of ``df`` in one table.

    The returned DataFrame is indexed by column name and has three columns:
    ``Type`` (dtype), ``Missing`` (null count) and ``Unique`` (number of
    distinct non-null values).
    """
    summary = {}
    summary['Type'] = df.dtypes
    summary['Missing'] = df.isnull().sum()
    summary['Unique'] = df.nunique()
    return pd.DataFrame(summary)

print("Quality report generator defined.")

Quality report generator defined.


## Section 2.2.1: Student Management System

### Q6. Design a relational database schema for managing students, courses, enrollments, and attendance.

In [105]:
import sqlite3
# In-memory database: the schema lives only as long as this connection.
conn = sqlite3.connect(':memory:')
# SQLite does not enforce foreign keys unless asked to, per connection.
conn.execute("PRAGMA foreign_keys = ON")
cursor = conn.cursor()
# Enrollments links a student to a course; Attendance hangs off an
# enrollment. The REFERENCES clauses stop orphan rows from skewing the
# GPA and attendance reports built on these tables.
cursor.executescript("""
CREATE TABLE Students (id INTEGER PRIMARY KEY, name TEXT, email TEXT);
CREATE TABLE Courses (id INTEGER PRIMARY KEY, title TEXT);
CREATE TABLE Enrollments (
    id INTEGER PRIMARY KEY,
    sid INT NOT NULL REFERENCES Students(id),
    cid INT NOT NULL REFERENCES Courses(id),
    grade TEXT
);
CREATE TABLE Attendance (
    id INTEGER PRIMARY KEY,
    eid INT NOT NULL REFERENCES Enrollments(id),
    date DATE,
    status TEXT
);
""")
print("Schema created in memory database.")

Schema created in memory database.


### Q7. Write SQL queries to calculate the GPA of each student.

In [106]:
# Per-student GPA on a 4.0 scale (A=4, B=3, C=2, D=1, F=0).
# The CASE has no ELSE on purpose: ungraded (NULL) or unrecognised grades
# become NULL, which AVG ignores, so they no longer count as 0.0.
gpa_query = """
SELECT sid,
AVG(CASE grade WHEN 'A' THEN 4.0 WHEN 'B' THEN 3.0 WHEN 'C' THEN 2.0
               WHEN 'D' THEN 1.0 WHEN 'F' THEN 0.0 END) as GPA
FROM Enrollments GROUP BY sid;
"""
print("GPA query designed.")

GPA query designed.


### Q8. Generate attendance reports for individual students and courses.

In [107]:
# Attendance percentage for each (student, course) pair.
# Attendance rows are tied to a student and course through Enrollments
# (a.eid = e.id). Rows with status 'P' count as present; multiplying by
# 100.0 keeps the division in floating point.
att_query = """
SELECT sid, cid, 
SUM(CASE WHEN status='P' THEN 1 ELSE 0 END)*100.0/COUNT(*) as Percent
FROM Attendance a JOIN Enrollments e ON a.eid = e.id
GROUP BY sid, cid;
"""
print("Attendance report query designed.")

Attendance report query designed.


### Q9. Analyze course performance using enrollment and grade data.

In [108]:
# Per-course average grade points (A=4, B=3, C=2, D=1, F=0) and enrollment
# count. Previously only 'A' earned points, so B and C grades were averaged
# in as 0. Ungraded (NULL) rows are skipped by AVG but still counted.
perf_query = """
SELECT cid,
AVG(CASE grade WHEN 'A' THEN 4.0 WHEN 'B' THEN 3.0 WHEN 'C' THEN 2.0
               WHEN 'D' THEN 1.0 WHEN 'F' THEN 0.0 END) as AvgGrade,
COUNT(*) as Count
FROM Enrollments GROUP BY cid;
"""
print("Course performance query designed.")

Course performance query designed.


### Q10. Identify at-risk students based on grades and attendance patterns.

In [109]:
# Students at risk: GPA below 2.0 OR overall attendance below 75%.
# Letter grades are mapped to points first; AVG(grade) on the raw text
# evaluated to 0 for everyone and flagged every student. DISTINCT returns
# each student once instead of once per enrollment.
risk_query = """
SELECT DISTINCT sid FROM Enrollments
WHERE sid IN (
    SELECT sid FROM Enrollments GROUP BY sid
    HAVING AVG(CASE grade WHEN 'A' THEN 4.0 WHEN 'B' THEN 3.0 WHEN 'C' THEN 2.0
                          WHEN 'D' THEN 1.0 WHEN 'F' THEN 0.0 END) < 2.0)
OR sid IN (
    SELECT e.sid FROM Attendance a JOIN Enrollments e ON a.eid = e.id
    GROUP BY e.sid HAVING AVG(a.status = 'P') < 0.75);
"""
print("At-risk identification query designed.")

At-risk identification query designed.


## Section 2.3.1: Accessing and Processing Data from APIs

### Q11. Fetch weather data from at least two different weather APIs.

In [110]:
import requests
def fetch_weather(api_url, timeout=10):
    """Fetch a weather API endpoint and return its decoded JSON body.

    Args:
        api_url: Full URL of the endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail on error statuses instead of decoding an error page as data.
    response.raise_for_status()
    return response.json()
print("Weather fetcher ready.")

Weather fetcher ready.


### Q12. How do you securely manage API keys in your application?

In [111]:
import os
# Read the key from the process environment instead of hard-coding it in
# the source. os.getenv returns None when API_KEY is not set, so check
# for None before using the key.
key = os.getenv('API_KEY')
print("Using environment variables for keys.")

Using environment variables for keys.


### Q13. Handle API rate limits and failed requests gracefully.

In [112]:
import time
def safe_get(url, retries=3, backoff=1.0, timeout=10):
    """GET ``url`` and return its JSON body, retrying failed attempts.

    An attempt fails when the request raises (connection error, timeout)
    or the response status is not 200. Waits between attempts grow
    exponentially (``backoff``, ``2 * backoff``, ...); a numeric
    ``Retry-After`` header on a 429 response overrides the wait.

    Args:
        url: URL to request.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds between attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The parsed JSON of the first 200 response, or ``None`` if every
        attempt fails.
    """
    for attempt in range(retries):
        delay = backoff * (2 ** attempt)
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network failure: treat like a failed attempt and retry.
            r = None
        if r is not None:
            if r.status_code == 200:
                return r.json()
            retry_after = r.headers.get('Retry-After', '')
            if r.status_code == 429 and retry_after.isdigit():
                # The server told us how long to back off; respect it.
                delay = int(retry_after)
        # No point sleeping once the last attempt has failed.
        if attempt < retries - 1:
            time.sleep(delay)
    return None
print("Retry logic implemented.")

Retry logic implemented.


### Q14. Normalize weather data obtained from different APIs into a common format.

In [113]:
def normalize(data, source):
    """Map a provider-specific payload onto the shared ``temp``/``humidity`` keys.

    Payloads from source ``'A'`` are read from ``t`` and ``h``; payloads
    from any other source are read from ``temp`` and ``hum``.
    """
    if source == 'A':
        temp_key, humidity_key = 't', 'h'
    else:
        temp_key, humidity_key = 'temp', 'hum'
    return {'temp': data[temp_key], 'humidity': data[humidity_key]}
print("Normalization function defined.")

Normalization function defined.


### Q15. Compare daily weather reports and forecasts from multiple sources.

In [114]:
def compare(w1, w2):
    """Return the absolute difference between the ``temp`` values of two reports."""
    first_temp = w1['temp']
    second_temp = w2['temp']
    return abs(first_temp - second_temp)
print("Comparison function defined.")

Comparison function defined.


### Q16. Implement a basic alert system based on weather conditions.

In [115]:
def alert(data):
    """Print a heat warning when the reported ``temp`` is above 40."""
    too_hot = data['temp'] > 40
    if too_hot:
        print("Heat Alert!")
print("Alert system ready.")

Alert system ready.


## Section 2.4.1: Web Scraping

### Q17. Scrape news articles from multiple websites while following ethical scraping practices.

In [116]:
from bs4 import BeautifulSoup
def scrape(url, timeout=10):
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    The request identifies itself with the ``ResearchBot`` User-Agent.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server, so a stalled site cannot
            hang the scraper.

    Returns:
        A ``BeautifulSoup`` document for the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(url, headers={'User-Agent': 'ResearchBot'}, timeout=timeout)
    # Do not parse error pages (403, 404, 5xx) as if they were articles.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')
print("Ethical scraper ready.")

Ethical scraper ready.


### Q18. How do you ensure compliance with robots.txt and terms of service?

In [117]:
from urllib.robotparser import RobotFileParser
def check_robots(url, user_agent="*"):
    """Report whether robots.txt allows ``user_agent`` to fetch ``url``.

    The robots.txt location is derived from the scheme and host of ``url``
    (``https://host/robots.txt``), whatever path ``url`` itself has.

    Args:
        url: Page the scraper intends to request.
        user_agent: User-Agent token to check the rules for.

    Returns:
        ``True`` if fetching is permitted, ``False`` if it is disallowed
        or robots.txt could not be retrieved because of a network error.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))

    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        # Without read() the parser has no rules and can_fetch() always
        # answers False.
        rp.read()
    except OSError:
        # URLError is an OSError; when the rules are unknown, stay
        # conservative and do not scrape.
        return False
    return rp.can_fetch(user_agent, url)
print("Robots.txt checker ready.")

Robots.txt checker ready.


### Q19. Extract headlines, full content, authors, publication dates, and categories from news pages.

In [118]:
def extract(soup):
    """Pull the headline and author out of a parsed news page.

    Args:
        soup: Parsed document exposing ``find`` and ``select_one``
            (a ``BeautifulSoup`` object).

    Returns:
        A dict with keys ``'headline'`` (text of the first ``<h1>``) and
        ``'author'`` (text of the first element with class ``author``).
        A value is ``None`` when the element is not on the page.
    """
    headline = soup.find('h1')
    # '.author' is a CSS selector; find() would look for a tag literally
    # named ".author" and return None, so select_one() is required.
    author = soup.select_one('.author')
    return {
        'headline': headline.text if headline is not None else None,
        'author': author.text if author is not None else None,
    }
print("Extraction logic defined.")

Extraction logic defined.


### Q20. Store the scraped news data in a structured format for analysis.

In [119]:
import pandas as pd
def save(data, path='news.csv'):
    """Write scraped records to a CSV file.

    Args:
        data: Anything ``pandas.DataFrame`` accepts, e.g. a list of dicts
            with one dict per article.
        path: Destination file; defaults to ``news.csv`` in the current
            working directory.
    """
    # index=False keeps the positional row index out of the file, so the
    # CSV contains only the scraped fields.
    pd.DataFrame(data).to_csv(path, index=False)
print("Storage function ready.")

Storage function ready.


### Q21. Analyze trends or patterns in the collected news data.

In [120]:
def analyze(df):
    """Count how many rows fall into each value of the ``category`` column."""
    categories = df['category']
    return categories.value_counts()
print("Analysis function ready.")

Analysis function ready.


## Section 2.5.1: Large Datasets

### Q22. Process CSV files that are larger than available system memory.

In [121]:
def process_large(file, chunksize=1000, handler=None):
    """Stream a CSV file in fixed-size chunks instead of loading it whole.

    Only one chunk is held by this function at a time, so the file may be
    larger than available memory.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        chunksize: Maximum number of rows per chunk.
        handler: Optional callable invoked with each chunk (a DataFrame),
            e.g. to aggregate or write the chunk elsewhere.

    Returns:
        The total number of data rows read.
    """
    total_rows = 0
    # The context manager closes the underlying file even if the handler
    # raises part-way through.
    with pd.read_csv(file, chunksize=chunksize) as reader:
        for chunk in reader:
            if handler is not None:
                handler(chunk)
            total_rows += len(chunk)
    return total_rows
print("Large file processor ready.")

Large file processor ready.


### Q23. Explain how chunk-based processing works in pandas.

In [122]:
# More precisely: pd.read_csv(..., chunksize=n) returns a single
# TextFileReader, and iterating over it yields DataFrames of up to n rows.
print("Chunking yields TextFileReader objects allowing iterative processing of slices.")

Chunking yields TextFileReader objects allowing iterative processing of slices.


### Q24. Monitor and limit memory usage while processing large datasets.

In [123]:
import psutil
print(f"RAM: {psutil.virtual_memory().percent}%")

RAM: 63.0%


### Q25. Optimize file I/O operations for large-scale data.

In [124]:
print("Using Parquet or Feather formats for faster I/O.")

Using Parquet or Feather formats for faster I/O.


### Q26. Track and display progress while processing large files.

In [125]:
from tqdm import tqdm
for i in tqdm(range(100)): pass

100%|██████████| 100/100 [00:00<00:00, 1815716.02it/s]


## Capstone Project

### Q28. Choose a real-world domain and identify relevant data sources.

In [126]:
print("Domain: E-commerce. Sources: Store API, Competitor Web Scraping, Sales CSV.")

Domain: E-commerce. Sources: Store API, Competitor Web Scraping, Sales CSV.


### Q29. Design a complete data pipeline including ingestion, storage, processing, API, and visualization layers.

In [127]:
print("Pipeline: Airflow -> PostgreSQL -> Spark -> Flask -> Streamlit.")

Pipeline: Airflow -> PostgreSQL -> Spark -> Flask -> Streamlit.
