# Lab 2 — Python Tools/Libraries (NumPy, Pandas, Matplotlib, scikit-learn)

**Learning Objectives:**
1) Manipulate arrays with **NumPy**
2) Load & clean data using **Pandas**
3) Visualize with **Matplotlib**
4) Train a tiny model with **scikit-learn**

**Dataset used:** `Apartment.csv`


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


## 1) NumPy essentials

In [None]:

# --- Array creation: a 1-D vector and a 2x3 matrix -------------------------
a = np.arange(1, 6)
print(f"a: {a}")
print(f"shape: {a.shape}")
print(f"dtype: {a.dtype}")

b = np.arange(1, 7).reshape(2, 3)
print(f"\nb:\n {b}")
print(f"shape: {b.shape}")

# --- Indexing and slicing ---------------------------------------------------
print(f"a[0]= {a[0]}")
print(f"a[1:4]= {a[1:4]}")
print(f"b[0,2]= {b[0, 2]}")  # row 0, column 2

# --- Element-wise arithmetic and simple reductions --------------------------
print(f"\na + 10 = {a + 10}")
print(f"a * 2 = {a * 2}")
print(f"mean(a)= {np.mean(a)}")
print(f"std(a)= {np.std(a)}")


### Broadcasting

In [None]:

# 3x3 matrix holding 1..9 in row-major order.
X = np.arange(1, 10).reshape(3, 3)

# Reducing over axis 0 yields one mean per column (shape (3,)).
col_means = np.mean(X, axis=0)

# Subtracting the (3,) vector from the (3, 3) matrix broadcasts it across
# every row, i.e. each column is centred on its own mean.
print(X, col_means, X - col_means, sep='\n')


## 2) Pandas essentials

In [None]:

import os

# Load the lab dataset when it sits next to the notebook; otherwise fall back
# to a small in-memory sample with the same three columns so the remaining
# cells can still run.
csv_path = 'Apartment.csv'
if not os.path.exists(csv_path):
    df = pd.DataFrame(
        [
            (250, 5.2, 450),
            (310, 3.8, 520),
            (180, 7.0, 320),
            (400, 2.5, 610),
            (275, 4.1, 480),
        ],
        columns=['area', 'distance', 'price'],
    )
else:
    df = pd.read_csv(csv_path)

# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:

# Summary statistics for every numeric column.
print(df.describe())

# Selecting a single column gives a Series.
area = df['area']

# Boolean-mask selection: keep the rows strictly closer than the median distance.
near = df.loc[df['distance'].lt(df['distance'].median())]
print("Rows below median distance:", len(near))


### Missing values

In [None]:

# Work on a copy so the original frame stays untouched for the later cells.
# Integer columns cannot hold NaN, so cast the two columns to float up front
# instead of relying on pandas upcasting implicitly during the assignment.
df2 = df.copy().astype({'area': float, 'distance': float})

# Inject two missing values to have something to clean.
df2.loc[0, 'area'] = np.nan
df2.loc[2, 'distance'] = np.nan

# Missing-value count per column before imputation.
print(df2.isna().sum())

# Mean imputation. Assign the result back to the column: calling
# `fillna(..., inplace=True)` on `df2['col']` acts on a selection of the frame,
# which pandas warns about and which leaves `df2` unchanged under Copy-on-Write.
df2['area'] = df2['area'].fillna(df2['area'].mean())
df2['distance'] = df2['distance'].fillna(df2['distance'].mean())

# Missing-value count per column after imputation.
print(df2.isna().sum())


## 3) Matplotlib essentials

In [None]:

# Scatter plot of price against area, drawn through the Axes interface.
fig, ax = plt.subplots()
ax.scatter(df['area'], df['price'])
ax.set_xlabel('Area')
ax.set_ylabel('Price')
ax.set_title('Price vs Area')
ax.grid(True)
plt.show()


## 4) Mini ML pipeline

In [None]:

# Features: area and distance. Target: price.
X = df.loc[:, ['area', 'distance']]
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Ordinary least-squares fit on the training rows (`fit` returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# NOTE(review): with the 5-row fallback frame a 20% split leaves a single test
# row, and R2 is not well defined for fewer than two samples -- interpret the
# score only when the real CSV is loaded.
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")


In [None]:

# Predicted vs. actual prices for the held-out rows.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Reference diagonal y = x across the range of actual values: points on this
# line are predicted exactly.
diag = [y_test.min(), y_test.max()]
ax.plot(diag, diag)

ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted")
ax.grid(True)
plt.show()
