# **PYTHON BASICS**

In [5]:
# 1. IMPORT LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Automatically display plots inside notebook
%matplotlib inline

# 2. LOAD THE DATASET
import io

from google.colab import files

# Fallback copy of the dataset on Google Drive. Reading it requires Drive to
# be mounted at /content/drive before this section runs.
DRIVE_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/DSE(extra)/student-dataset.csv"

# files.upload() opens the browser upload widget and returns a dict that maps
# each uploaded filename to its raw bytes (empty when nothing was uploaded).
uploaded = files.upload()

# Load CSV into DataFrame.
# Prefer the file that was just uploaded -- previously the upload was requested
# and then ignored -- and fall back to the Drive copy otherwise.
if uploaded:
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    df = pd.read_csv(DRIVE_CSV_PATH)

# Show first few rows.
# A bare `df.head()` is only rendered by a notebook when it is the last
# expression in its cell, so print it explicitly.
print("----- Head of Dataset -----")
print(df.head())

# Dataset info (df.info() writes its report to stdout by itself)
print("----- Dataset Info -----")
df.info()

# Basic statistics for every column, numeric and non-numeric alike
print("----- Statistical Summary -----")
print(df.describe(include="all"))

# 3. DATA CLEANING
# Check missing values (count of NaN entries per column)
print("----- Missing Values -----")
print(df.isnull().sum())

# Fill missing numerical values with the mean of their own column.
num_cols = df.select_dtypes(include=np.number).columns
if not num_cols.empty:
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Fill missing categorical values with the mode of their own column.
# mode() has no rows when there are no object columns (or no non-null values),
# and `.iloc[0]` on that empty result raises IndexError, so only fill when a
# first mode row actually exists.
cat_cols = df.select_dtypes(include='object').columns
if not cat_cols.empty:
    cat_modes = df[cat_cols].mode()
    if not cat_modes.empty:
        df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

print("After Cleaning:")
print(df.isnull().sum())

# 4. NUMPY OPERATIONS

print("\n===== NumPy Operations =====")

# Pull the scores out as a NumPy array: the 'Marks' column when the dataset
# has one, otherwise the first numeric column as a safety fallback.
marks = (
    df['Marks'].values
    if 'Marks' in df.columns
    else df.select_dtypes(include=np.number).iloc[:, 0].values
)

print("NumPy Array of Marks:", marks[:10])

# Summary statistics of the scores, printed in a fixed order.
for _label, _stat in (
    ("Mean:", np.mean),
    ("Median:", np.median),
    ("Standard Deviation:", np.std),
):
    print(_label, _stat(marks))

# 5x5 matrix of random integers (low bound 50 inclusive, high bound 100 exclusive)
rand_matrix = np.random.randint(50, 100, (5, 5))
print("\nRandom 5x5 Matrix:\n", rand_matrix)

# Reductions over the whole matrix
for _label, _reduce in (
    ("Matrix Sum:", np.sum),
    ("Matrix Max:", np.max),
    ("Matrix Min:", np.min),
):
    print(_label, _reduce(rand_matrix))

# 5. PANDAS OPERATIONS

print("\n===== Pandas Data Analysis =====")

# Display first & last 5 rows
print(df.head())
print(df.tail())

# Column used for ranking and for the ">80" filter.
# Prefer 'Marks' when it is a numeric column, so these steps look at the same
# data as the NumPy section; blindly taking the first numeric column could
# pick an ID or age column instead. Fall back to the first numeric column, and
# to None when the dataset has no numeric columns at all.
numeric_cols = df.select_dtypes(include=np.number).columns
if 'Marks' in numeric_cols:
    num_col = 'Marks'
elif not numeric_cols.empty:
    num_col = numeric_cols[0]
else:
    num_col = None

# Sorting by the score column, highest first
if num_col is not None:
    df_sorted = df.sort_values(by=num_col, ascending=False)
    print(df_sorted.head())

# Grouping (example: by Gender) -- mean of every numeric column per group
if 'Gender' in df.columns:
    print(df.groupby('Gender').mean(numeric_only=True))

# Filtering (students scoring above 80)
if num_col is not None:
    high_scores = df[df[num_col] > 80]
    print("\nStudents with high marks (>80):")
    print(high_scores.head())

# 6. MATPLOTLIB VISUALIZATION

print("\n===== Matplotlib Visualizations =====")

# 1. Histogram: distribution of the marks over 10 bins
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(marks, bins=10)
ax.set(title="Distribution of Marks", xlabel="Marks", ylabel="Frequency")
plt.show()

# 2. Bar Plot (example: gender count) -- drawn only when a Gender column exists
if 'Gender' in df.columns:
    gender_counts = df['Gender'].value_counts()
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.bar(gender_counts.index, gender_counts.values)
    ax.set(title="Gender Count", xlabel="Gender", ylabel="Count")
    plt.show()

# 3. Line Plot: marks of the first 50 students, in dataset order
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(marks[:50])
ax.set(
    title="Marks Trend (First 50 Students)",
    xlabel="Student Index",
    ylabel="Marks",
)
plt.show()

# 4. Scatter Plot: first numeric column against the second one, when the
#    dataset has at least two numeric columns
num_cols_list = df.select_dtypes(include=np.number).columns
if len(num_cols_list) >= 2:
    x_name, y_name = num_cols_list[0], num_cols_list[1]
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(df[x_name], df[y_name])
    ax.set(title=f"{x_name} vs {y_name}", xlabel=x_name, ylabel=y_name)
    plt.show()

# 7. SAVE CLEANED DATASET

# Single source of truth for the output file, so the confirmation message can
# never drift from the path that was actually written. The path is relative,
# so the file lands in the current working directory.
cleaned_csv_path = "student-dataset-cleaned.csv"

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(cleaned_csv_path, index=False)
print(f"\nCleaned dataset saved as: {cleaned_csv_path}")