<a href="https://colab.research.google.com/github/young78703/young78703.github.io/blob/main/melb_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load and inspect melb_data.csv

In [None]:
# Load the CSV into a DataFrame; the relative path is resolved against the
# current working directory.
df = pd.read_csv('melb_data.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Convert the date column to datetime type. `infer_datetime_format` is
# deprecated since pandas 2.0 (format inference is now the default behaviour),
# so it is no longer passed.
# NOTE(review): if the CSV stores dates as day/month/year, pass dayfirst=True
# (or an explicit format=) here — confirm against the file.
df['Date'] = pd.to_datetime(df['Date'])
# Extract the calendar parts as new columns.
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
# Day of month (an integer), the same value extract_datetime_features writes.
df['day'] = df['Date'].dt.day
# Time-of-day component of each timestamp (datetime.time objects).
df['time'] = df['Date'].dt.time
# Monday=0 ... Sunday=6
df['dayofweek'] = df['Date'].dt.weekday

In [None]:
import pandas as pd

def extract_datetime_features(df, column_name):
    """
    Convert a column to datetime and add year, month, day and day-of-week columns.

    The frame is modified in place (and also returned), so callers that ignore
    the return value still see the new columns.

    Parameters:
    - df: Pandas DataFrame
    - column_name: string, the name of the column to convert to datetime type

    Returns:
    - df: the same DataFrame, with ``column_name`` converted and the columns
      'year', 'month', 'day' (day of month) and 'dayofweek' (Monday=0) added.
      Rows whose date is missing get NaN in all four columns.
    """
    # `infer_datetime_format` is deprecated since pandas 2.0; format inference
    # is the default behaviour, so the plain call is equivalent.
    df[column_name] = pd.to_datetime(df[column_name])

    # The .dt accessor already yields NaN for missing dates, so no per-field
    # "is there any value" branching is needed (and numeric columns never end
    # up holding datetime NaT).
    parts = df[column_name].dt
    df['year'] = parts.year
    df['month'] = parts.month
    df['day'] = parts.day
    df['dayofweek'] = parts.weekday

    return df

In [None]:
# The function updates df in place; show a preview rather than the whole frame.
extract_datetime_features(df, 'Date').head()

In [None]:
import pandas as pd
import numpy as np

def impute_nulls(df):
    """
    Impute null values in a Pandas DataFrame based on the dtype of each column.

    The frame is modified in place and also returned.

    - Float and complex columns: filled with the column mean.
    - Integer columns (nullable integer dtypes): filled with the median,
      rounded to the nearest integer so the dtype is preserved.
    - Datetime columns: back-filled with the next valid value, then
      forward-filled so trailing nulls are covered as well.
    - Every other dtype (object, string, bool, category, timedelta, ...):
      filled with the mode (most frequent value).

    Columns without nulls, and columns that are entirely null (nothing to
    learn a fill value from), are left untouched.

    Parameters:
    - df: Pandas DataFrame

    Returns:
    - df: the same DataFrame with nulls imputed
    """
    dtype_checks = pd.api.types

    for col in df.columns:
        series = df[col]
        missing = series.isnull()
        if not missing.any() or missing.all():
            continue

        dtype = series.dtype
        # Dtype predicates are used instead of string comparisons because
        # e.g. datetime64[ns] does not compare equal to 'datetime64'.
        if dtype_checks.is_float_dtype(dtype) or dtype_checks.is_complex_dtype(dtype):
            filled = series.fillna(series.mean())
        elif dtype_checks.is_integer_dtype(dtype):
            filled = series.fillna(int(round(series.median())))
        elif dtype_checks.is_datetime64_any_dtype(dtype):
            filled = series.bfill().ffill()
        else:
            filled = series.fillna(series.mode()[0])

        # Assign back instead of fillna(inplace=True) on df[col]: the chained
        # in-place form may act on a temporary copy and leave df unchanged.
        df[col] = filled
    return df

In [None]:
# impute_nulls fills df in place; show a preview rather than the whole frame.
impute_nulls(df).head()

In [None]:
# Re-check dtypes and non-null counts after the imputation cell.
df.info()

In [None]:
# Number of rows per postcode, drawn as a bar chart.
postcode_counts = df['Postcode'].value_counts()
postcode_counts.plot(kind='bar', figsize=(10, 6))
plt.tight_layout()

In [None]:
# Number of rows per value of 'Type', drawn as a bar chart.
type_counts = df['Type'].value_counts()
type_counts.plot(kind='bar', figsize=(10, 6))
plt.tight_layout()

In [None]:
# Scatter the two coordinate columns against each other. The column names are
# spelled as they exist in df at this point (the rename happens in a later cell).
sns.scatterplot(x='Lattitude', y='Longtitude', data=df)

In [None]:
# Correct the misspelled column name in place.
column_fixes = {'Longtitude': 'Longitude'}
df.rename(columns=column_fixes, inplace=True)

In [None]:
# Preview the first rows to confirm the renamed column.
df.head()

In [None]:
# Number of rows per council area.
council_counts = df['CouncilArea'].value_counts()
council_counts.plot(kind='bar', figsize=(10, 8))
plt.xlabel('CouncilArea')
plt.ylabel('Counts')
plt.title('Counts by CouncilArea')
plt.tight_layout()

In [None]:
# Select 'Price' before aggregating: averaging the whole frame first wastes
# work and raises on non-numeric columns in pandas >= 2.0.
mean_price_by_council = df.groupby('CouncilArea')['Price'].mean()
mean_price_by_council.plot.bar(figsize=(10, 8))
plt.title('Mean Price groupby CouncilArea')
plt.xlabel('CouncilArea')
plt.ylabel('Price')
plt.tight_layout()

In [None]:
import matplotlib.pyplot as plt

def plot_mean_by_group(df, dependent_var, independent_vars):
    """
    Draw one bar plot per grouping column showing the mean of a numeric column.

    Parameters:
    - df: Pandas DataFrame
    - dependent_var: string, the name of the numeric column to average
    - independent_vars: list of strings, the names of the grouping (categorical)
      columns; a single column name is also accepted

    Returns:
    - None (each figure is shown with plt.show())
    """
    # Accept a bare column name as shorthand for a one-element list.
    if isinstance(independent_vars, str):
        independent_vars = [independent_vars]

    for var in independent_vars:
        # Select the dependent column before aggregating: averaging every
        # column first raises on non-numeric columns in pandas >= 2.0.
        grouped = df.groupby(var)[dependent_var].mean()
        grouped.plot.bar(figsize=(8, 6))
        plt.title(f'Mean {dependent_var} groupby {var}')
        plt.xlabel(var)
        plt.ylabel(dependent_var)
        plt.tight_layout()
        plt.show()

In [None]:
# Mean price for each level of three categorical columns.
plot_mean_by_group(
    df,
    dependent_var='Price',
    independent_vars=['Type', 'Method', 'CouncilArea'],
)

In [None]:
# List the column names currently in df.
df.columns

In [None]:
# Restrict to numeric columns: DataFrame.corr() raises on object columns in
# pandas >= 2.0, and select_dtypes works on older versions too.
df.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(12, 10))
# Restrict to numeric columns: DataFrame.corr() raises on object columns in
# pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='viridis', linewidths=1, linecolor='w', annot=True, fmt='.2f')
plt.title('Correlation matrix of numeric columns');

In [None]:
def plots_for_checking_outliers(data, column):
    """
    Show a histogram, a box plot and a normal Q-Q plot of one column side by side.

    Missing values are dropped before plotting so that the skewness annotation
    and the Q-Q plot are computed on actual observations.

    Parameters:
    - data: Pandas DataFrame
    - column: string, the name of the numeric column to inspect

    Returns:
    - None (the figure is shown with plt.show())
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from scipy.stats import skew

    # skew() returns NaN as soon as one value is missing, so work on the
    # observed values only.
    values = data[column].dropna()

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))

    # Histogram
    sns.histplot(values, ax=axes[0])
    axes[0].set_title(f"Histogram of {column}")

    # Box plot. Pass the values by keyword: the meaning of the first
    # positional argument differs between seaborn versions.
    sns.boxplot(x=values, ax=axes[1])
    axes[1].set_title(f"Box plot of {column}")

    # Q-Q plot against a normal distribution, annotated with the skewness
    sm.qqplot(values, line='s', ax=axes[2])
    axes[2].set_title(f"Q-Q plot of {column} against a normal distribution")
    skewness = skew(values)
    axes[2].text(0.05, 0.95, f"Skewness: {skewness:.2f}", transform=axes[2].transAxes, ha='left', va='top')

    plt.tight_layout()
    plt.show()

In [None]:
# Distribution diagnostics for the price column.
plots_for_checking_outliers(df, column='Price')

In [None]:
# Preprocess a categorical column (ordinal variable) using LabelEncoder:
# replace each category with an integer code.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which is not
# necessarily the intended ordinal ranking — confirm, or use an explicit mapping.
ordinal_column = 'Checking account'
if ordinal_column in df.columns:
    encoder = LabelEncoder()
    df[ordinal_column] = encoder.fit_transform(df[ordinal_column])
else:
    # Report and skip instead of raising KeyError so Run All can continue.
    print(f"Column {ordinal_column!r} not found in df; skipping label encoding.")

In [None]:
# Clip 'Price' to +/-3.5 standard deviations. The function returns a clipped
# copy (df itself is not changed), which this cell only displays.
# NOTE: clip_outliers_by_zscores is defined in a later cell, so that cell has
# to be executed before this one.
clip_outliers_by_zscores(df, 'Price', upper_zscore=3.5, lower_zscore=-3.5)

In [None]:
from scipy.stats import rankdata

def drop_outliers_by_percentiles(data, column, lower_percentile, upper_percentile):
    """
    Drops rows from a Pandas DataFrame based on percentile ranks of a given column.

    Each row's percentile is its rank (ties averaged) divided by the number of
    rows, times 100. Rows whose percentile is <= ``lower_percentile`` or
    >= ``upper_percentile`` are removed.

    Parameters:
    data (pandas.DataFrame): The input data (not modified).
    column (str): The name of the column to use for computing percentiles.
    lower_percentile (float): The lower percentile bound (between 0 and 100).
    upper_percentile (float): The upper percentile bound (between 0 and 100).

    Returns:
    pandas.DataFrame: The rows of ``data`` that fall strictly between the bounds.

    Raises:
    ValueError: If the column is missing or a bound is outside [0, 100].
    """
    # Check input arguments
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not (0 <= lower_percentile <= 100):
        raise ValueError("Lower percentile bound must be between 0 and 100.")
    if not (0 <= upper_percentile <= 100):
        raise ValueError("Upper percentile bound must be between 0 and 100.")

    # Compute percentiles. The Series must carry data's own index: boolean
    # selection with .loc aligns on labels, so a fresh 0..n-1 index would fail
    # (or select the wrong rows) for any frame that was filtered or re-indexed.
    percentiles = pd.Series((rankdata(data[column]) / len(data)) * 100, index=data.index)

    # Drop outliers outside bounds
    mask = (percentiles >= upper_percentile) | (percentiles <= lower_percentile)
    return data.loc[~mask]

from scipy import stats

def drop_outliers_by_zscores(data, column, lower_zscore, upper_zscore):
    """
    Return the rows of ``data`` whose z-score in ``column`` lies strictly inside
    the given band.

    Parameters:
    data (pandas.DataFrame): The input data (not modified).
    column (str): Column whose z-scores decide which rows are removed.
    lower_zscore (float): Rows with a z-score at or below this value are dropped.
    upper_zscore (float): Rows with a z-score at or above this value are dropped.

    Returns:
    pandas.DataFrame: ``data`` without the rows flagged as outliers.

    Raises:
    ValueError: If the column is missing or a boundary is not finite.
    """
    # Validate the arguments up front
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not np.isfinite(lower_zscore):
        raise ValueError("Lower z-score boundary must be finite.")
    if not np.isfinite(upper_zscore):
        raise ValueError("Upper z-score boundary must be finite.")

    # Standardise the column, keeping data's index so the mask aligns by label
    scores = pd.Series(stats.zscore(data[column]), index=data.index)

    # A row is an outlier when it touches or exceeds either boundary
    too_high = scores.ge(upper_zscore)
    too_low = scores.le(lower_zscore)
    is_outlier = too_high | too_low
    return data.loc[~is_outlier]

def clip_outliers_by_zscores(data, column, upper_zscore, lower_zscore):
    """
    Return a copy of ``data`` in which ``column`` is limited to a z-score band.

    Note the positional order: the upper boundary comes BEFORE the lower one.

    Parameters:
    data (pandas.DataFrame): The input data (left unchanged).
    column (str): The name of the column to clip.
    upper_zscore (float): The upper z-score boundary.
    lower_zscore (float): The lower z-score boundary.

    Returns:
    pandas.DataFrame: A copy of ``data`` with ``column`` clipped to
    [mean + lower_zscore * std, mean + upper_zscore * std].

    Raises:
    ValueError: If the column is missing or a boundary is not finite.
    """
    # Validate the arguments up front
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not np.isfinite(lower_zscore):
        raise ValueError("Lower z-score boundary must be finite.")
    if not np.isfinite(upper_zscore):
        raise ValueError("Upper z-score boundary must be finite.")

    # Location and scale of the column
    center = np.mean(data[column])
    spread = np.std(data[column])

    # Translate both z-score boundaries into value boundaries
    floor, ceiling = (center + z * spread for z in (lower_zscore, upper_zscore))

    # Clip on a copy so the caller's frame stays as it was
    result = data.copy()
    result[column] = result[column].clip(lower=floor, upper=ceiling)
    return result

def clip_outliers_by_percentiles(data, column, lower_percentile, upper_percentile):
    """
    Clips the outliers of a column in a Pandas DataFrame based on percentiles.

    Like ``clip_outliers_by_zscores``, this works on a copy and leaves the
    caller's frame untouched. Missing values are ignored when the percentile
    values are computed and stay missing in the result.

    Parameters:
    data (pandas.DataFrame): The input data (not modified).
    column (str): The name of the column to clip.
    lower_percentile (float): The lower percentile bound (between 0 and 100).
    upper_percentile (float): The upper percentile bound (between 0 and 100).

    Returns:
    pandas.DataFrame: A copy of ``data`` with ``column`` clipped to the values
    found at the two percentiles.

    Raises:
    ValueError: If the column is missing, a bound is outside [0, 100], or the
    lower bound is greater than the upper bound.
    """
    # Check input arguments
    if column not in data.columns:
        raise ValueError("Column '%s' not found in data." % column)
    if not (0 <= lower_percentile <= 100):
        raise ValueError("Lower percentile bound must be between 0 and 100.")
    if not (0 <= upper_percentile <= 100):
        raise ValueError("Upper percentile bound must be between 0 and 100.")
    if lower_percentile > upper_percentile:
        raise ValueError("Lower percentile bound must not exceed the upper percentile bound.")

    # Compute percentile values. nanpercentile skips missing entries; plain
    # np.percentile would return NaN and silently disable the clipping.
    p_upper = np.nanpercentile(data[column], upper_percentile)
    p_lower = np.nanpercentile(data[column], lower_percentile)

    # Clip on a copy so the input frame is not mutated as a side effect
    data_clipped = data.copy()
    data_clipped[column] = data_clipped[column].clip(p_lower, p_upper)
    return data_clipped

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor, OLSInfluence
from statsmodels.graphics.tsaplots import plot_acf


def check_regression_assumptions(data, dependent_var, drop_columns=None, vif_threshold=5):
    """
    Fit an OLS model with statsmodels and show diagnostics for the assumptions
    of linear regression: linearity, normality, homoscedasticity, influential
    points (Cook's distance), autocorrelation and multicollinearity (VIF).

    NOTE(review): a later cell defines another function with this same name
    and a different signature, which replaces this one once that cell has run.

    Parameters:
    - data: Pandas DataFrame holding the dependent variable and all predictors
    - dependent_var: string, the name of the dependent variable column
    - drop_columns: optional list of column names to exclude from the predictors
    - vif_threshold: predictors whose VIF is above this value are reported

    Returns:
    - None (plots are shown and the VIF summary is printed)
    """
    # Avoid a mutable default argument; None means "drop nothing extra".
    drop_columns = list(drop_columns) if drop_columns is not None else []

    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Fit a linear regression model; the design matrix (with intercept) is
    # built once and reused for prediction and for the VIF computation.
    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()

    # Get the predicted values and residuals
    y_pred = model.predict(X_const)
    residuals = model.resid

    # Set up a grid for plotting multiple plots
    num_plots = 5
    fig, ax = plt.subplots(num_plots, 1, figsize=(6, 4 * num_plots))

    # Plot 1: Predicted Values vs. Residuals (Linearity)
    ax[0].scatter(y_pred, residuals)
    ax[0].set_xlabel('Predicted Values')
    ax[0].set_ylabel('Residuals')
    ax[0].set_title('Predicted Values vs. Residuals')

    # Plot 2: Q-Q Plot (Normality)
    sm.qqplot(residuals, line='s', ax=ax[1])
    ax[1].set_title("Q-Q Plot of Residuals")

    # Plot 3: Predicted Values vs. Standardized Residuals (Homoscedasticity)
    standardized_residuals = residuals / np.std(residuals)
    ax[2].scatter(y_pred, standardized_residuals)
    ax[2].set_xlabel('Predicted Values')
    ax[2].set_ylabel('Standardized Residuals')
    ax[2].set_title('Predicted Values vs. Standardized Residuals')

    # Plot 4: Cook's Distance. `use_line_collection` is not passed: it was
    # deprecated and later removed from Axes.stem, where the line-collection
    # behaviour is the default.
    cooks_distance = OLSInfluence(model).cooks_distance[0]
    ax[3].stem(cooks_distance, markerfmt=',')
    ax[3].set_xlabel('Observation Index')
    ax[3].set_ylabel("Cook's Distance")
    ax[3].set_title("Cook's Distance Plot")

    # Plot 5: Autocorrelation
    plot_acf(residuals, ax=ax[4])
    ax[4].set_title("Autocorrelation of Residuals")

    # Display the plots
    plt.tight_layout()
    plt.show()

    # VIF (Multicollinearity), computed with the intercept in the design
    # matrix and then reported for the real predictors only.
    vif_data = pd.Series(
        [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])],
        index=X_const.columns,
    )
    # add_constant adds no 'const' column when X already holds a constant one
    vif_data = vif_data.drop('const', errors='ignore')
    print("Variance Inflation Factors (VIF):\n", vif_data)

    multicollinear_columns = vif_data[vif_data > vif_threshold].index.tolist()
    if multicollinear_columns:
        print("The following variables have high multicollinearity:\n", multicollinear_columns)
    else:
        print("No variables have high multicollinearity.")


In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif, OLSInfluence
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler



def check_regression_assumptions(ModelClass, data, dependent_var, drop_columns=None, vif_threshold=10):
    """
    Fit ``ModelClass`` on standardized predictors and show diagnostics for the
    assumptions of linear regression: linearity, normality, homoscedasticity,
    autocorrelation and multicollinearity (VIF).

    Parameters:
    - ModelClass: estimator class called with no arguments; the instance must
      provide ``fit(X, y)`` and ``predict(X)`` (e.g. sklearn LinearRegression)
    - data: Pandas DataFrame holding the dependent variable and all predictors
    - dependent_var: string, the name of the dependent variable column
    - drop_columns: optional list of column names to exclude from the predictors
    - vif_threshold: predictors whose VIF is above this value are reported

    Returns:
    - VIF_df: Pandas DataFrame with the columns "VIF Factor" and "Predictor",
      one row per predictor
    """
    # Avoid a mutable default argument; None means "drop nothing extra".
    drop_columns = list(drop_columns) if drop_columns is not None else []

    # Define the independent and dependent variables
    X = data.drop([dependent_var] + drop_columns, axis=1)
    y = data[dependent_var]

    # Data scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Model fitting. Predictions must use the same scaled features the model
    # was trained on; predicting on the raw X gives meaningless residuals.
    model = ModelClass()
    model.fit(X_scaled, y)
    y_pred = model.predict(X_scaled)
    residuals = y - y_pred

    # Set up a grid for plotting multiple plots
    num_plots = 4
    fig, ax = plt.subplots(num_plots, 1, figsize=(6, 4 * num_plots))

    # Plot 1: Predicted Values vs. Residuals (Linearity)
    ax[0].scatter(y_pred, residuals)
    ax[0].set_xlabel('Predicted Values')
    ax[0].set_ylabel('Residuals')
    ax[0].set_title('Predicted Values vs. Residuals')

    # Plot 2: Q-Q Plot (Normality)
    sm.qqplot(residuals, line='s', ax=ax[1])
    ax[1].set_title("Q-Q Plot of Residuals")

    # Plot 3: Predicted Values vs. Standardized Residuals (Homoscedasticity)
    standardized_residuals = residuals / np.std(residuals)
    ax[2].scatter(y_pred, standardized_residuals)
    ax[2].set_xlabel('Predicted Values')
    ax[2].set_ylabel('Standardized Residuals')
    ax[2].set_title('Predicted Values vs. Standardized Residuals')

    # Plot 4: Autocorrelation
    plot_acf(residuals, ax=ax[3])
    ax[3].set_title("Autocorrelation of Residuals")

    # Display the plots
    plt.tight_layout()
    plt.show()

    # 5. Variance Inflation Factors (VIF). The design matrix is passed as a
    # NumPy array and includes an intercept; the intercept row is then removed
    # so only real predictors are reported.
    X_vif = sm.add_constant(X)
    VIF_df = pd.DataFrame({
        "VIF Factor": [vif(X_vif.values, i) for i in range(X_vif.shape[1])],
        "Predictor": X_vif.columns,
    })
    VIF_df = VIF_df[VIF_df["Predictor"] != "const"].reset_index(drop=True)

    # Report the predictors above the requested threshold
    multicollinear_columns = VIF_df.loc[VIF_df["VIF Factor"] > vif_threshold, "Predictor"].tolist()
    if multicollinear_columns:
        print("The following variables have high multicollinearity:\n", multicollinear_columns)
    else:
        print("No variables have high multicollinearity.")

    return VIF_df



In [None]:
# Columns deliberately kept out of the model. Both spellings of the longitude
# column are listed because an earlier cell renames 'Longtitude' to 'Longitude'.
excluded_columns = {'Suburb', 'Address', 'Type', 'Method', 'Bedroom2', 'SellerG', 'Date', 'Postcode',
                    'CouncilArea', 'Lattitude', 'Longtitude', 'Longitude', 'Regionname'}
# Non-numeric columns (e.g. the 'time' column of datetime.time objects) cannot
# be scaled or regressed on, so they are excluded as well.
non_numeric_columns = set(df.select_dtypes(exclude='number').columns)
# Only names that exist in df end up in the list, so the drop cannot raise KeyError.
drop_columns = [col for col in df.columns
                if col != 'Price' and (col in excluded_columns or col in non_numeric_columns)]

check_regression_assumptions(LinearRegression, data=df, dependent_var='Price',
                             drop_columns=drop_columns, vif_threshold=10)

In [None]:
# The definition of check_regression_assumptions in effect at this point (the
# most recent one above) requires the estimator class as its first argument,
# so it is passed explicitly here.
# Columns deliberately kept out of the model. Both spellings of the longitude
# column are listed because an earlier cell renames 'Longtitude' to 'Longitude'.
excluded_columns = {'Suburb', 'Address', 'Type', 'Method', 'Bedroom2', 'SellerG', 'Date', 'Postcode',
                    'CouncilArea', 'Lattitude', 'Longtitude', 'Longitude', 'Regionname'}
# Non-numeric columns cannot be scaled or regressed on, so they are excluded as well.
non_numeric_columns = set(df.select_dtypes(exclude='number').columns)
# Only names that exist in df end up in the list, so the drop cannot raise KeyError.
drop_columns = [col for col in df.columns
                if col != 'Price' and (col in excluded_columns or col in non_numeric_columns)]

check_regression_assumptions(ModelClass=LinearRegression, data=df, dependent_var='Price',
                             drop_columns=drop_columns, vif_threshold=10)

In [None]:
# List the column names currently in df.
df.columns