In [1]:
import pandas as pd

# Load the music dataset and report, per column, how many values are missing.
# sort_values() orders the counts ascending, so the columns with the most
# missing entries (if any) end up at the bottom of the printout.
music_df = pd.read_csv('music.csv')
music_null_counts = music_df.isna().sum()
print(music_null_counts.sort_values())

# Same check for the diabetes dataset.
diabetes_df = pd.read_csv('diabetes.csv')
diabetes_null_counts = diabetes_df.isna().sum()
print(diabetes_null_counts.sort_values())

# Reading the output: every column listed for the two frames reports a count
# of 0, i.e. no missing data was found. Had there been any, the ascending sort
# would rank the columns by their number of missing values.

Unnamed: 0                  0
valence                     0
instrumentalness            0
acousticness                0
loudness                    0
danceability                0
feelings                    0
sadness                     0
like/girls                  0
family/spiritual            0
light/visual perceptions    0
movement/places             0
energy                      0
music                       0
communication               0
romantic                    0
family/gospel               0
shake the audience          0
night/time                  0
world/life                  0
violence                    0
dating                      0
len                         0
genre                       0
release_date                0
obscene                     0
age                         0
dtype: int64
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                        

In [5]:
# Drop every row that has a missing value in at least one of these columns;
# rows that are complete in all four are kept unchanged.
required_columns = ['genre', 'loudness', 'instrumentalness', 'energy']
music_df = music_df.dropna(subset=required_columns)

In [None]:
"""
A better option than dropping columns with missing values ==> Imputation
-- Imputation fills in the missing values with some number.
-- For instance, we can fill in the mean value of each column (other strategies: median, most_frequent).
-- We must split the data before imputation to avoid data leakage: the fill-in values
   should be learned from the training set only and then applied to the test set.

The imputed value won't be exactly right in most cases, but it usually leads to more accurate models than
you would get from dropping the column entirely.

"""
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Use melb_data and use information such as the number of rooms and land size to predict home price.
melb_df = pd.read_csv('melb_data.csv')

# Missing-value count per column, fewest first.
print(melb_df.isna().sum().sort_values())

# Features are every column except the target; the target is the sale price.
X = melb_df.drop("Price", axis=1)
y = melb_df['Price']

# Split BEFORE fitting any imputer/scaler/encoder. The preprocessing steps live
# inside the pipeline, so fit() learns their statistics from X_train only and
# the test rows never influence them (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=12
)

# Column names by dtype; only names are taken from X here, no statistics.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical columns: fill gaps with the most frequent value, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale features
])

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown='ignore' encodes categories first seen at predict
# time as all zeros instead of raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Price is a continuous quantity, so this is a regression problem.
# LogisticRegression is a classifier and is not appropriate for this target,
# so a linear regressor is used instead.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, y_train)

# For a regressor, score() returns the coefficient of determination (R^2) on
# the held-out data, not an accuracy.
score = pipeline.score(X_test, y_test)
print(f"Model R^2 on the test set: {score:.2f}")

In [None]:
# Star_data set
star_df = pd.read_csv('star_dataset.csv')

# Print the missing-value count per column, fewest first.
# sort_values has to be called: without the parentheses print() shows the
# bound-method repr rather than the sorted counts.
print(star_df.isna().sum().sort_values())

# Drop the rows with a missing value in these columns (noted by the author as
# the columns where less than 5% of the values are missing).
star_df = star_df.dropna(subset=['Distance', 'Luminosity', 'Radius'])

# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int makes only the generated indicator columns 0/1 integers. Casting
# the whole frame with .astype(int) would also truncate any float feature
# columns and would raise if a remaining column still contained NaN.
star_dummies = pd.get_dummies(star_df, drop_first=True, dtype=int)
star_dummies.head(100)