In [None]:
#@title Copyright 2023 Google LLC. Double-click for license information.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Colabs

Machine Learning Crash Course uses Colaboratories (Colabs) for all programming exercises. Colab is Google's implementation of [Jupyter Notebook](https://jupyter.org/). For more information about Colabs and how to use them, go to [Welcome to Colaboratory](https://research.google.com/colaboratory).

# Numerical data: Statistics on a dataset

This Colab programming exercise (first of two) is part of the Machine Learning Crash Course module [Working with numerical data](https://developers.google.com/machine-learning/crash-course/numerical-data).

## What to expect

In the [First steps with numerical data](https://developers.google.com/machine-learning/crash-course/numerical-data/first-steps) section, you learned how to do the following:
- Visualize your data in plots or graphs.
- Evaluate potential features and labels mathematically.
- Find [**outliers**](https://developers.google.com/machine-learning/glossary/#outliers) in the dataset.

This exercise takes you through the process of finding columns that contain blatant outliers, which you can then decide to keep in or delete from the dataset.

In [None]:
# @title Setup - Import relevant modules

# The following code imports relevant modules that
# allow you to run the colab.
# If you encounter technical issues running some of the code sections
# that follow, try running this section again.

import pandas as pd

# Keep rendered tables compact: limit how many rows pandas displays and
# format floating-point values with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [None]:
#@title Import the dataset

# Read the hosted California housing training CSV into the DataFrame that the
# remaining cells of this colab work with.
training_df = pd.read_csv(
    filepath_or_buffer="https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
)

## Get basic statistics

In the following code section, the DataFrame `describe` method returns basic statistics on all the columns in the dataset, such as:

* `count` is the number of populated elements in this column. Ideally, every column contains the same value for `count`, but that's not always the case.
* `mean` is the traditional average of values in that column. We recommend comparing the `mean` to the median for each column. The **median** is the 50% row of the table.
* `std` is the standard deviation of the values in this column.
* `min`, `25%`, `50%`, `75%`, and `max` are the values at the 0th, 25th, 50th, 75th, and 100th percentiles, respectively.

In [None]:
# Get statistics on the dataset.

# `describe()` is the final expression of this cell, so the notebook renders
# its summary table (count, mean, std, min, percentiles, max) as the output.

training_df.describe()

### Task: Identify possible outliers

Based on the preceding statistics, do you see any columns that might contain outliers?

In [None]:
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import plotly.express as px
import numpy as np
import seaborn as sns

In [None]:
# Function to compute 75th / 25th percentile ratio or difference
def percentile_comparison(df, method='ratio'):
    """
    Compares the 75th and 25th percentiles of every column that `describe` summarizes.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        method (str): 'ratio' for division (75th / 25th) or 'difference' for
            subtraction (75th - 25th).

    Returns:
        pd.Series: One value per summarized column, indexed by column name.
            The Series is empty when `df` has no columns for which `describe`
            reports percentiles (for example, no columns at all, or only
            text columns).

    Raises:
        ValueError: If `method` is neither 'ratio' nor 'difference'.

    Note:
        The ratio is only a meaningful measure of spread when the 25th
        percentile is positive; prefer 'difference' otherwise.
    """
    # Validate `method` first so that a bad argument is always reported as such,
    # whatever `df` contains and before any statistics are computed.
    if method not in ('ratio', 'difference'):
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")

    # `describe` refuses a frame without columns; there is nothing to compare.
    if df.shape[1] == 0:
        return pd.Series(dtype='float64')

    summary = df.describe(percentiles=[0.25, 0.75])

    # When no column is numeric, `describe` reports count/unique/top/freq
    # instead, so the percentile rows do not exist.
    if '25%' not in summary.index or '75%' not in summary.index:
        return pd.Series(dtype='float64')

    lower_quartile = summary.loc['25%']
    upper_quartile = summary.loc['75%']

    if method == 'ratio':
        return upper_quartile / lower_quartile
    return upper_quartile - lower_quartile

# Ratio of the 75th to the 25th percentile for each column; pass
# method='difference' to get the 75th - 25th difference instead.
comparison_results = percentile_comparison(df=training_df, method='ratio')

In [None]:
# Columns whose 75th/25th percentile ratio is above this value are listed as
# candidates for containing outliers; all remaining columns are listed as
# regular. A large ratio signals a wide spread, not a confirmed outlier.
SPREAD_RATIO_THRESHOLD = 1.5

print("The outliers:")
print(comparison_results[comparison_results > SPREAD_RATIO_THRESHOLD])

# `<=` (rather than `<`) keeps a ratio exactly equal to the threshold in this
# list instead of silently dropping it from both groups.
print("\nSome fields show regular distribution. In particular:")
print(comparison_results[comparison_results <= SPREAD_RATIO_THRESHOLD])

In [None]:
# 75th - 25th percentile differences, printed only for the columns whose
# 75th/25th percentile ratio is above 1.5.
difference_results = percentile_comparison(df=training_df, method='difference')

print("The difference in spreads for those that show a larger ratio between the 75th and 25th percentile:")
print(difference_results.loc[comparison_results > 1.5])

In [None]:
# Draw five scatter plots, one for each feature pair listed below. Points in
# every plot are colored by median_house_value, and each plot includes an
# "ols" trendline.
scatter_axis_pairs = [
    ('longitude', 'latitude'),
    ('housing_median_age', 'median_house_value'),
    ('total_rooms', 'total_bedrooms'),
    ('population', 'households'),
    ('median_income', 'median_house_value'),
]

for x_axis_data, y_axis_data in scatter_axis_pairs:
    scatter_fig = px.scatter(
        training_df,
        x=x_axis_data,
        y=y_axis_data,
        color='median_house_value',
        trendline="ols",
    )
    scatter_fig.show()

In [None]:
# Pairwise correlation between median_income and median_house_value.
training_df[['median_income', 'median_house_value']].corr()

In [None]:
# Histogram of median_income (nbins=50) to inspect the shape of its distribution.
px.histogram(training_df, x='median_income', nbins=50).show()

In [None]:
# Histogram of median_house_value (nbins=50) to inspect the shape of its distribution.
px.histogram(training_df, x='median_house_value', nbins=50).show()

In [None]:
# @title Solution (run this code block to view) { display-mode: "form" }

# Print the expected answer: the columns suspected of containing outliers and
# the two cues from the statistics table that point to them.
print("""The following columns might contain outliers:

  * total_rooms
  * total_bedrooms
  * population
  * households
  * possibly, median_income

In all of those columns:

  * the standard deviation is almost as high as the mean
  * the delta between 75% and max is much higher than the
      delta between min and 25%.""")