<a href="https://colab.research.google.com/github/touhid71/pandas_numPy_metrics/blob/master/Assisment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# 1. Create arrays
arr1 = np.arange(0, 21)  # 1D array from 0 to 20
arr2 = np.random.randint(10, 100, size=(4, 5))  # 2D array 4x5 random ints 10–99
identity_matrix = np.eye(3)  # 3x3 identity matrix

print("1D Array:\n", arr1)
print("\n2D Array (4x5):\n", arr2)
print("\nIdentity Matrix (3x3):\n", identity_matrix)

# 2. Mean, median, std of 2D array
print("\nMean of 2D array:", np.mean(arr2))
print("Median of 2D array:", np.median(arr2))
print("Standard Deviation of 2D array:", np.std(arr2))

# 3. Slice operations
print("\nSecond row:", arr2[1])
print("Third column:", arr2[:, 2])

# 4. Element-wise multiplication
arr3 = np.random.randint(1, 10, size=(4, 5))
print("\nAnother 4x5 Array:\n", arr3)
print("\nElement-wise Multiplication:\n", arr2 * arr3)

1D Array:
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

2D Array (4x5):
 [[52 60 74 71 47]
 [20 37 19 30 64]
 [26 31 60 14 60]
 [77 73 48 57 66]]

Identity Matrix (3x3):
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

Mean of 2D array: 49.3
Median of 2D array: 54.5
Standard Deviation of 2D array: 19.68019308848366

Second row: [20 37 19 30 64]
Third column: [74 19 60 48]

Another 4x5 Array:
 [[3 2 4 2 9]
 [6 9 2 2 1]
 [9 4 2 3 8]
 [8 1 2 3 6]]

Element-wise Multiplication:
 [[156 120 296 142 423]
 [120 333  38  60  64]
 [234 124 120  42 480]
 [616  73  96 171 396]]


In [2]:
#Broadcasting and Reshaping

# 1. Create 1D array and reshape
import numpy as np

arr = np.arange(1, 17)  # 1D array from 1 to 16
matrix = arr.reshape(4, 4)
print("Original 4x4 Matrix:\n", matrix)

# 2. Add [1,2,3,4] to each row (broadcasting)
add_arr = np.array([1, 2, 3, 4])
result = matrix + add_arr
print("\nAfter Broadcasting Addition:\n", result)

# 3. Flatten back to 1D
flattened = result.flatten()
print("\nFlattened 1D Array:\n", flattened)

Original 4x4 Matrix:
 [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]

After Broadcasting Addition:
 [[ 2  4  6  8]
 [ 6  8 10 12]
 [10 12 14 16]
 [14 16 18 20]]

Flattened 1D Array:
 [ 2  4  6  8  6  8 10 12 10 12 14 16 14 16 18 20]


In [3]:
# Pandas – Data Manipulation(Working with DataFrames)
import pandas as pd

# 1. Create DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "Diana", "Evan"],
    "Age": [25, 30, 28, 35, 40],
    "Department": ["IT", "HR", "IT", "Marketing", "HR"],
    "Salary": [50000, 45000, 72000, 58000, 60000]
}

df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# 2. Add Bonus column (10% of salary)
df["Bonus"] = df["Salary"] * 0.10
print("\nDataFrame with Bonus:\n", df)

# 3. Filter HR employees
hr_employees = df[df["Department"] == "HR"]
print("\nEmployees in HR Department:\n", hr_employees)

# 4. Average salary by department
avg_salary = df.groupby("Department")["Salary"].mean()
print("\nAverage Salary by Department:\n", avg_salary)

# 5. Save to CSV
df.to_csv("employees.csv", index=False)
print("\n✅ DataFrame saved as 'employees.csv'")

Original DataFrame:
       Name  Age Department  Salary
0    Alice   25         IT   50000
1      Bob   30         HR   45000
2  Charlie   28         IT   72000
3    Diana   35  Marketing   58000
4     Evan   40         HR   60000

DataFrame with Bonus:
       Name  Age Department  Salary   Bonus
0    Alice   25         IT   50000  5000.0
1      Bob   30         HR   45000  4500.0
2  Charlie   28         IT   72000  7200.0
3    Diana   35  Marketing   58000  5800.0
4     Evan   40         HR   60000  6000.0

Employees in HR Department:
    Name  Age Department  Salary   Bonus
1   Bob   30         HR   45000  4500.0
4  Evan   40         HR   60000  6000.0

Average Salary by Department:
 Department
HR           52500.0
IT           61000.0
Marketing    58000.0
Name: Salary, dtype: float64

✅ DataFrame saved as 'employees.csv'


In [4]:
#  Data Cleaning & Analysis
import seaborn as sns

# Load sample dataset
df_titanic = sns.load_dataset("titanic")

# 1. First 5 and last 5 rows
print("First 5 rows:\n", df_titanic.head())
print("\nLast 5 rows:\n", df_titanic.tail())

# 2. Shape
print("\nShape of dataset:", df_titanic.shape)

# 3. Columns and data types
print("\nColumns and Data Types:\n", df_titanic.dtypes)

# 4. Handle missing values (fill with median/most frequent)
df_titanic["age"].fillna(df_titanic["age"].median(), inplace=True)
df_titanic["embarked"].fillna(df_titanic["embarked"].mode()[0], inplace=True)
df_titanic.dropna(subset=["fare"], inplace=True)

# 5. Basic statistics
print("\nBasic Statistics:\n", df_titanic.describe())

print("\n✅ Data Cleaning Completed.")

First 5 rows:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Last 5 rows:
      survived  pclass     sex   age  sibsp  parch   fare embarked   class  \
886         0       2    male  27.0      0      0  13.00        S  Second   
887         1       1  female  19.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_titanic["age"].fillna(df_titanic["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_titanic["embarked"].fillna(df_titanic["embarked"].mode()[0], inplace=True)


In [5]:
#  Classification Evaluation
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target

# Binary classification: Versicolor vs not
df["binary_target"] = (df["target"] == 1).astype(int)

# Split data
X = df[iris.feature_names]
y = df["binary_target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Confusion Matrix & Metrics
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print(f"\nAccuracy: {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {rec:.2f}")
print(f"F1 Score: {f1:.2f}")

# Explanation
print("""
📘 Metric Explanations:
- Accuracy: Percentage of correctly predicted samples.
- Precision: Of all predicted positives, how many are actually positive.
- Recall: Of all actual positives, how many did the model correctly identify.
- F1-Score: Harmonic mean of Precision and Recall — balances both.
""")

Confusion Matrix:
 [[29  3]
 [ 9  4]]

Accuracy: 0.73
Precision: 0.57
Recall: 0.31
F1 Score: 0.40

📘 Metric Explanations:
- Accuracy: Percentage of correctly predicted samples.
- Precision: Of all predicted positives, how many are actually positive.
- Recall: Of all actual positives, how many did the model correctly identify.
- F1-Score: Harmonic mean of Precision and Recall — balances both.

