<a href="https://colab.research.google.com/github/venkataratnamb20/pubdataml/blob/main/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab examples

## Pandas DataFrame: Create from lists of values

In [None]:
import pandas as pd

# One list per column; the lists must have the same length.
first_names = ['Sarah', 'John', 'Kyle']
last_names = ['Connor', 'Connor', 'Reese']

# Keyword names become the column names, in the order given.
df = pd.DataFrame(dict(first_name=first_names, last_name=last_names))
df

## Pandas DataFrame: Rename multiple Columns

In [None]:
import pandas as pd
# Sample frame whose column headers will be replaced.
df = pd.DataFrame({
    'Year': [2016, 2015, 2014, 2013, 2012],
    'Top Animal': ['Giant panda', 'Chicken', 'Pig', 'Turkey', 'Dog'],
})

# Map each old column name to its replacement; unlisted columns keep their name.
new_names = {'Year': 'Calendar Year', 'Top Animal': 'Favorite Animal'}
df = df.rename(columns=new_names)
df

## Pandas DataFrame: Query by regexp (regular expression)

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle', 'Joe'],
  last_name=['Connor', 'Connor', 'Reese', 'Bonnot'],
))

# Boolean mask: True where the regular expression matches the last name.
matches_pattern = df['last_name'].str.match('.*onno.*')
df[matches_pattern]

## Pandas DataFrame: Query by variable value

Evaluate a variable as the value to find.

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
))

# Inside a query string, `@name` refers to the Python variable `name`.
foo = 'Connor'
query_expr = 'last_name == @foo'
df.query(query_expr)

## Pandas DataFrame: Query using variable value as a column name

Evaluate a variable, to use its value as the name of a column in a query.

E.g. Query for rows where `John` is the value in the column named `first_name`.

In [None]:
import pandas as pd
df = pd.DataFrame(data=dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
))

# The column to filter on is only known at run time, so splice its name
# (wrapped in backticks) into the query expression.
column_name = 'first_name'
query_expr = "`{}` == 'John'".format(column_name)
df.query(query_expr)

## Pandas DataFrame: Query by Timestamp above a value

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  time=['2022-09-14 00:52:00-07:00', '2022-09-14 00:52:30-07:00',
        '2022-09-14 01:52:30-07:00'],
  letter=['A', 'B', 'C'],
))
# Parse the strings so the column holds timestamps rather than text.
df['time'] = pd.to_datetime(df['time'])

# Keep rows whose time is at or after the quoted timestamp.
df.query('time >= "2022-09-14 00:52:30-07:00"')

## Pandas DataFrame: Query for Timestamp between two values

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  time=['2022-09-14 00:52:00-07:00', '2022-09-14 00:52:30-07:00',
        '2022-09-14 01:52:30-07:00'],
  letter=['A', 'B', 'C'],
))
# Parse the strings so the column holds timestamps rather than text.
df['time'] = pd.to_datetime(df['time'])

# Half-open interval: begin_ts is included, end_ts is excluded.
begin_ts, end_ts = '2022-09-14 00:52:00-07:00', '2022-09-14 00:54:00-07:00'

df.query('@begin_ts <= time < @end_ts')

## Pandas DataFrame: Filter by Timestamp in DatetimeIndex using `.loc[]`

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  time=['2022-09-14 00:52:00-07:00', '2022-09-14 00:52:30-07:00',
        '2022-09-14 01:52:30-07:00'],
  letter=['A', 'B', 'C'],
))
# Parse the strings, then move the timestamps into the index so that
# .loc[] can slice rows by time.
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')

# Slice bounds may be partial date/time strings.
df.loc['2022-09-14':'2022-09-14 00:53']

## Pandas DataFrame: Filter by Timestamp using TimeDelta string

In [None]:
import pandas as pd
df = pd.DataFrame({
  'time': ['2022-09-14 00:52:00-07:00', '2022-09-14 00:52:30-07:00',
           '2022-09-14 01:52:30-07:00'],
  'letter': ['A', 'B', 'C'],
})
df['time'] = pd.to_datetime(df.time)

def rows_in_time_range(df, time_column, start_ts_str, timedelta_str,
                       tz='US/Pacific'):
  """Return rows of df where start_ts <= df[time_column] < start_ts + delta.

  Args:
    df: DataFrame to filter.
    time_column: Name of the column holding timezone-aware timestamps.
    start_ts_str: Start of the range, either a date ('2022-09-01') or a time
      ('2022-09-14 00:52:00-07:00'). A value without a UTC offset is
      interpreted as wall-clock time in `tz`; a value with an offset is used
      as given.
    timedelta_str: Length of the range, e.g. '2 minutes' or
      '2 days 2 hours 15 minutes 30 seconds'.
    tz: Time zone applied to a start_ts_str that has no offset.

  Returns:
    The matching rows of df. The start is inclusive and the end is exclusive.
  """
  start_ts = pd.Timestamp(start_ts_str)
  if start_ts.tzinfo is None:
    # tz_localize() raises on a timestamp that already has a zone, so only
    # localize naive input.
    start_ts = start_ts.tz_localize(tz)
  end_ts = start_ts + pd.to_timedelta(timedelta_str)
  # A boolean mask (rather than a formatted query string) also works for
  # column names that are not valid Python identifiers.
  times = df[time_column]
  return df[(times >= start_ts) & (times < end_ts)]

rows_in_time_range(df, 'time', '2022-09-14 00:00', '52 minutes 31 seconds')

## Pandas: Describe Timestamp values in a column

In [None]:
import pandas as pd
df = pd.DataFrame({
  'time': ['2022-09-14 00:52:00-07:00', '2022-09-14 00:52:30-07:00',
           '2022-09-14 01:52:30-07:00'],
  'letter': ['A', 'B', 'C'],
})
df['time'] = pd.to_datetime(df.time)

# The `datetime_is_numeric` argument was removed in pandas 2.0, where passing
# it raises TypeError; datetime columns are now always summarized numerically.
# NOTE: on pandas 1.x, pass datetime_is_numeric=True to get the same output.
time_summary = df['time'].describe()
time_summary

## Pandas DataFrame: Explode a column containing dictionary values into multiple columns

This code transforms or splits the dictionary column into many columns.

E.g. The output DataFrame of this cell will have columns named [`date, letter, fruit, weather`].

In [None]:
import pandas as pd
df = pd.DataFrame({
  'date': ['2022-09-14', '2022-09-15', '2022-09-16'],
  'letter': ['A', 'B', 'C'],
  'dict' : [{ 'fruit': 'apple', 'weather': 'aces'},
            { 'fruit': 'banana', 'weather': 'bad'},
            { 'fruit': 'cantaloupe', 'weather': 'cloudy'}],
})

# Build the new columns in one call from the list of dicts (one column per
# key). This avoids creating a separate Series for every row, which is what
# `.apply(pd.Series)` does. Reusing df.index keeps the rows aligned.
dict_columns = pd.DataFrame(df['dict'].tolist(), index=df.index)
exploded = pd.concat([df.drop(columns=['dict']), dict_columns], axis=1)
exploded

## Pandas DataFrame: Extract values using regexp (regular expression)

In [None]:
import pandas as pd
df = pd.DataFrame({
  'request': ['GET /index.html?baz=3', 'GET /foo.html?bar=1'],
})

# Use a raw string so the backslash in `\?` is passed to the regex engine
# as written; in a normal string literal `\?` is an invalid escape sequence.
# The capture group is the request path up to (not including) the `?`.
request_path = df['request'].str.extract(r'GET /([^?]+)\?', expand=True)
request_path

## Pandas Timestamp: Convert string to Timestamp, using date only

I.e. Midnight on the given date.

In [None]:
import pandas as pd

# Parse the date-only string, then attach the US/Pacific time zone to it.
pd.Timestamp('9/27/22').tz_localize('US/Pacific')

## Pandas Timestamp: Convert string to Timestamp

In [None]:
import pandas as pd

# Parse the date-and-time string, then attach the US/Pacific time zone to it.
pd.Timestamp('9/27/22 06:59').tz_localize('US/Pacific')

## Pandas: Create a TimeDelta using `unit`

From an integer.
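`unit` is a string, defaulting to `ns`. Common values include `W` (weeks), `D` (days), `h` (hours), `min` (minutes), `s` (seconds), `ms` (milliseconds), `us` (microseconds), and `ns` (nanoseconds).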


In [None]:
import pandas as pd

# The integer 1 is interpreted in the given unit ('h'), i.e. one hour.
pd.to_timedelta(1, unit='h')

## Pandas: Create a TimeDelta using available kwargs

Example keyworded args: {days, seconds, microseconds, milliseconds, minutes, hours, weeks}

In [None]:
import pandas as pd

# Keyword arguments name the time components directly; here, two days.
pd.Timedelta(days=2)

## Pandas: Create a TimeDelta from a string

In [None]:
import pandas as pd

# The duration is parsed from a string of "<number> <unit>" pairs.
pd.Timedelta('2 days 2 hours 15 minutes 30 seconds')

## Pandas: Replace NaN values in a Column

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({
  'dogs': [5, 10, np.nan, 7],
})

# fillna() is the dedicated API for replacing missing values. The previous
# replace(np.nan, 0, regex=True) asked for regex matching, which has no
# meaning for a NaN target.
# The result is a new Series; df itself is left unchanged.
dogs_filled = df['dogs'].fillna(0)
dogs_filled

## Pandas DataFrame: Drop duplicate rows

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle', 'Joe'],
  last_name=['Connor', 'Connor', 'Reese', 'Bonnot'],
)).set_index('last_name')

# duplicated() flags every repeat of an index label after its first
# occurrence; inverting the mask keeps only the first row per label.
is_first_occurrence = ~df.index.duplicated()
df.loc[is_first_occurrence, :]

## Pandas DataFrame: Ignore one Column

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle', 'Joe'],
  last_name=['Connor', 'Connor', 'Reese', 'Bonnot'],
))

# Boolean mask over the columns: True for every column except last_name.
keep_column = df.columns != 'last_name'
df.loc[:, keep_column]

## Pandas DataFrame: Intersect Indexes

In [None]:
import pandas as pd
# Two frames, each indexed by first name.
terminator_df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
)).set_index('first_name')

buckaroo_df = pd.DataFrame(dict(
  first_name=['John', 'John', 'Buckaroo'],
  last_name=['Parker', 'Whorfin', 'Banzai'],
)).set_index('first_name')

# Index labels present in both frames; .shape reports how many there are.
shared_names = terminator_df.index.intersection(buckaroo_df.index)
shared_names.shape

## Pandas DataFrame: Select all rows from A that are not in B, using the index

In [None]:
import pandas as pd
# Two frames, each indexed by first name.
terminator_df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
)).set_index('first_name')

buckaroo_df = pd.DataFrame(dict(
  first_name=['John', 'John', 'Buckaroo'],
  last_name=['Parker', 'Whorfin', 'Banzai'],
)).set_index('first_name')

# True for each terminator row whose index label also appears in buckaroo_df;
# inverting the mask selects the rows that are only in terminator_df.
also_in_buckaroo = terminator_df.index.isin(buckaroo_df.index)
terminator_df[~also_in_buckaroo]

## Pandas DataFrame: Select rows by an attribute of a column value

Use the Series `map()` method.
E.g. To filter by the length of a column's values:

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
))

# Apply len() to every last name, then keep rows whose name has 5 characters.
name_lengths = df['last_name'].map(len)
df[name_lengths == 5]

## Pandas DataFrame: Sort the count of rows grouped on columns

In [None]:
import pandas as pd
df = pd.DataFrame(dict(
  first_name=['Sarah', 'John', 'Kyle'],
  last_name=['Connor', 'Connor', 'Reese'],
))

# Count the rows in each last_name group, then list the largest groups first.
rows_per_last_name = df.groupby(['last_name']).size()
rows_per_last_name.sort_values(ascending=False)

## Pandas DataFrame: Reshape to have 1 row per value in a list column

Creates a new DataFrame that is a transformed version of the input. E.g.
*   Input: df with a column named `msg_ids` that is a list of values (i.e. many per row, at least in some rows).
*   Output: new_df which has 1 row per unique value found in any of the original `msg_ids` lists, with that value in a new column named `msg_id`.


In [None]:
import pandas as pd
df = pd.DataFrame({
  'date': ['9/1/22', '9/2/22', '9/3/22'],
  'action': ['Add', 'Update', 'Delete'],
  'msg_ids': [[1, 2, 3], [], [2, 3]],
})
df.set_index('date', inplace=True)

# explode() emits one row per list element, and NaN for an empty list.
# Dropping those NaNs before casting keeps the ids as integers; padding the
# lists into a table of columns would turn them into floats.
temp_series = (
    df['msg_ids']
    .explode()
    .dropna()
    .astype('int64')
    .rename('msg_id')
)
# The ids become the index, so duplicates can be dropped via the index.
new_df = temp_series.to_frame().set_index('msg_id')
unique_df = new_df.loc[~new_df.index.duplicated(), :]  # Drop duplicates.
unique_df

## Pandas: DataFrames: Group Timeseries by Frequency

You can group timestamped data into intervals of arbitrary duration using a Grouper object to specify groupby instructions.  The `freq` parameter is a string that may contain an integer followed by an [offset alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).  E.g. To see output for 2 minute long intervals:

In [None]:
import pandas as pd
request_times = [
  '2022-09-01 00:00:01-07:00', '2022-09-01 00:00:02-07:00',
  '2022-09-01 00:01:00-07:00', '2022-09-01 00:02:00-07:00',
  '2022-09-01 00:03:00-07:00', '2022-09-01 00:04:00-07:00',
  '2022-09-01 00:05:00-07:00', '2022-09-01 00:07:00-07:00',
]
df = pd.DataFrame(dict(time=request_times, requests=[1, 1, 1, 1, 1, 1, 1, 1]))
# Parse the strings so the rows can be grouped by time.
df['time'] = pd.to_datetime(df['time'])

# Bucket the rows into 2-minute intervals of `time` and total each bucket.
two_minute_buckets = pd.Grouper(key='time', freq='2min')
df.groupby(two_minute_buckets).sum()

# vb-work

## Udacity: Artificial Intelligence
- [github/udacity-courses](https://github.com/udacity/AI_fundamentals/)

Books

- [Artificial Intelligence: A Modern Approach, by Russell and Norvig](http://aima.cs.berkeley.edu/)

### Topics
- Advanced Search
- Minimax Algorithm
- Alpha-Beta Pruning
- Evaluation Functions
- Isolation Game Player

## Isolation Game

## AI With Azure

- [Artificial intelligence (AI) vs. machine learning (ML)](https://azure.microsoft.com/en-us/resources/cloud-computing-dictionary/artificial-intelligence-vs-machine-learning)
- [Artificial Intelligence For Industrial Applications](https://semiengineering.com/artificial-intelligence-for-industrial-applications/)
-  
-

### Responsible AI
- [Tools and practices](https://www.microsoft.com/en-us/ai/tools-practices)
- [microsoft: haxtoolkit-demo](https://www.microsoft.com/en-us/haxtoolkit/demo/)
- [Responsible AI with Azure](https://azure.microsoft.com/en-us/solutions/ai/responsible-ai-with-azure/)
-

### ML

Key Machine Learning Concepts

Approaches to ML

Core Tasks in Building a Solution

Azure ML + Automated ML
- Ingest and Prepare Data
- Feature Engineering + Feature Selection
- Model Training and Evaluation
- Model Deployment and Management
- Testing Deployed Models

Data process

Key Phases of the Data Science Process
There are five key steps to the data science process:

- `Collect data` We need to be able to collect reliable data and ensure that we understand its origin, quality, and meaning.
- `Prepare data` The next phase in the data science process is data preparation. This can take up an enormous percentage of total time--estimates range upwards of 80-90%.
- `Train a model` In this phase, we apply appropriate algorithms to our data, resulting in a trained model. Depending on the algorithm, training may take a considerable amount of time. For example, certain applications of neural networks may take over a month to train fully. Generally, however, model training does not take that much time.
- `Evaluate the model` After we have trained a model, we want to ensure that it meets our expectations in terms of quality. Because the real world is always more complicated than our training data set, we want to make sure that the results look reasonable on non-training data before pushing a model out to production. This helps us avoid overfitting to the training data.
- `Deploy the model` Having a model is great, but having it available for use is the natural next step. Historically, deploying a model typically meant rewriting it into a "production" development language like C or C++. Today, it is easy to run a microservice in a language like R or Python and handle prediction needs.


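$$ \text{Collect data} \Rightarrow \text{Prepare data} \Rightarrow \text{Train model} \Rightarrow \text{Evaluate model} \Rightarrow \text{Deploy model} $$
$$ \text{Data} \Rightarrow \text{CSV} \Rightarrow \text{Trained Model} \Rightarrow \text{Model Evaluation} \Rightarrow \text{Data} \Rightarrow \text{CSV} $$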

### Choosing an Algorithm

- Accuracy
- Speed
- Explainability
- Existence

### Model Evaluation Metrics for Regression
| Metric | Best Use |
|--------|----------|
| R2 | Linear regression models |
| Mean Absolute Error (MAE) | Not much variance in observed values |
| Mean Absolute Percent Error (MAPE)  | Variance in actual values is high; actual value is never 0 |
| Root Mean Square Error (RMSE) | Larger discrepancies are way worse than smaller discrepancies |
| Root Mean Square Log Error (RMSLE) | Larger percentage discrepancies are way worse |


### Model Evaluation Metrics for Classification

Based on the type of model, there are several evaluation measures available. The most common classes of algorithm we want to test are `regression` and `classification`.

For classification, we have the `confusion matrix`, which allows us to define a variety of important measures. Three of these important measures are:

- Accuracy
- Precision
- Recall


### New Terms

- Microservice: A lightweight, independent service. Typically, microservices have one job and communicate with each other using well-defined operations.
- Label: The thing we want to predict.
- Feature: Inputs which help us understand what affects the label.
- Overfitting: A situation which happens when a trained model latches onto the particular relationships within a training data set, but those particulars are not always indicative of the broader world.
- R^2 (R-squared): An evaluation measure for linear regression models which ranges from 0-1, where 1 is the highest possible score.
- Mean Absolute Error (MAE): An evaluation measure for any regression model. It is the average difference between predicted and actual values. This works well when dealing with small ranges of numbers.
- Mean Absolute Percent Error (MAPE): An evaluation measure for any regression model. It is the percentage difference between the predicted and actual values. If the actual value is 0, MAPE will fail with a divide by 0 error, so it is not a good measure if the actual value can be 0. MAPE works best when you have large ranges of numbers.
- Root Mean Square Error (RMSE): An evaluation measure for any regression model. RMSE works best when you are concerned with large differences between the predicted and actual values.
- Root Mean Square Log Error (RMSLE): An evaluation measure for any regression model. RMSLE works best when you are concerned with large percentage differences between the predicted and actual values.
- Confusion matrix: A table representing predicted versus actual values for a classification problem. A classic two-class confusion matrix has four boxes. Using "Yes" and "No" as the two classes, these four boxes are:
  - True Positive: we predicted Yes correctly
  - False Positive: we predicted Yes but it was really No
  - False Negative: we predicted No but it was really Yes
  - True Negative: we predicted No correctly
- Accuracy (classification): A measure which is defined as the number of correct predictions divided by the total number of predictions.
- Precision: A measure which calculates how frequently our predicted value is correct. It is defined as True Positives / (True Positives + False Positives).
- Recall: A measure which calculates how frequently we correctly predict a value. It is defined as True Positives / (True Positives + False Negatives).
- Receiver Operating Characteristic (ROC) curve: A plot which represents true positive versus false positive rates for a two-class model.
- Area Under the Curve: the percentage of area underneath the ROC curve. This is a measure of how accurate the two-class model is, with numbers closer to 1 being better.

### There are four approaches to machine learning:

- Supervised learning: We have a known good answer for our label. The most common examples of this include classification and regression. This is by far the most common type of machine learning in the business world, as we generally develop models to solve known business problems, such as forecasting how much revenue the company will earn in the next quarter.
- Unsupervised learning: We do not have labels for our data. We use unsupervised learning techniques to try to discover what those labels should be. Clustering is the most common example of this.
- Semi-supervised learning: We have a small percentage of data with labels and a large percentage of unlabeled data. Perform supervised learning against the labeled data and then cluster the unlabeled data to find the nearest labeled points.
- Reinforcement learning: We train an agent to observe its environment and use those environmental clues to make a decision. For example, we might train a robot to sweep through a house in the least amount of time without getting stuck or sweeping over the same spots too frequently.

*Supervised Learning*

We have a known good answer for our label
Use this to train a model to predict labels for new data

- Classification
- Regression
- Anomaly Detection
- Similarity Learning
- Representation Learning

*Unsupervised Learning*

Do not have known good answers for the problem we want to solve
Train a model to discover patterns that humans did not define

- Clustering
- Anomaly Detection
- Representation Learning

*Semi-Supervised Learning*

- Small amount of labeled data and a large amount of unlabeled data.
- Learn from the labeled data
- Apply what we learn to the "nearest" unlabeled data

*Reinforcement Learning*

- Train an agent to observe its environment
- Use those environmental clues to make a decision
- Agent is rewarded based on nearness to the optimal choice

*Summary*
There are five core tasks involved in building a machine learning solution:

- Ingest and prepare data
- Feature selection and feature engineering
- Model training and evaluation
- Model deployment and management
- Testing deployed models

### New Terms

- `Azure Machine Learning Studio:` The integrated development environment (IDE) for Azure Machine Learning.
- `Pipeline (Azure ML):` A collection of components connected together in a defined order. The metaphor represents how data moves from a source (an initial dataset) and flows through components until it reaches a destination. There are two types of pipeline: training pipelines and inference pipelines.
- `Run (Azure ML):` An attempt to train a model in Azure Machine Learning. This can be done through a pipeline in the Azure ML designer or through Automated ML.
- `Experiment (Azure ML):` A collection of trials used to validate a user's hypothesis. An experiment may contain multiple runs of pipelines.
- `Compute (Azure ML):` Virtual machine resources which are dedicated to performing tasks in Azure Machine Learning. Compute may include individual virtual machines (VMs), typically configured as data science VMs, or it may include a cluster of VMs intended for training and inference pipeline executions.
- `Data Labeling:` This functionality allows you to label images as part of an image classification project.
- `Linked Services:` This functionality allows you to integrate Azure Machine Learning with other Azure services. At present, the only linked service offering is to connect to Azure Synapse Analytics, which is a modern data warehousing offering on Azure.
- `Pipeline Asset:` A component available within Azure Machine Learning. This includes datasets you have imported, sample datasets which come with the service, and different components to transform, train, evaluate, and deploy models.
- `Node (input, output):` An input or output connection point on a component. Each component will have 0 to 3 input nodes and 0 to 3 output nodes. Each input or output node has a specific type, such as DataFrameDirectory, TransformationDirectory, or UntrainedModelDirectory. An input of DataFrameDirectory can only attach to an output of the same type.
- `Source node:` A node with no inputs. An example of a source node is any dataset you bring onto the canvas.
- `Sink node:` A node with no outputs. An example of a sink node is Web Service Output.



### Glossary

For your reference, here are all the new terms we introduced in this lesson:

- Accuracy (classification): A measure which is defined as the number of correct predictions divided by the total number of predictions.
- Area Under the Curve: the percentage of area underneath the ROC curve. This is a measure of how accurate the two-class model is, with numbers closer to 1 being better.
- Azure Machine Learning Studio: The integrated development environment (IDE) for Azure Machine Learning.
- Compute (Azure ML): Virtual machine resources which are dedicated to performing tasks in Azure Machine Learning. Compute may include individual virtual machines (VMs), typically configured as data science VMs, or it may include a cluster of VMs intended for training and inference pipeline executions.
- Confusion matrix: A table representing predicted versus actual values for a classification problem. A classic two-class confusion matrix has four boxes. Using "Yes" and "No" as the two classes, these four boxes are:
  - True Positive: we predicted Yes correctly
  - False Positive: we predicted Yes but it was really No
  - False Negative: we predicted No but it was really Yes
  - True Negative: we predicted No correctly
- Data Labeling: This functionality allows you to label images as part of an image classification project.
- Experiment (Azure ML): A collection of trials used to validate a user's hypothesis. An experiment may contain multiple runs of pipelines.
- Feature: Inputs which help us understand what affects the label.
- Feature engineering: Creating new features from existing data. This might include calculating new features, translating a street address into latitude and longitude, or parsing passages of text for meaning.
- Feature selection: Removing a column from consideration when training a model.
- Label: The thing we want to predict.
- Linked Services: This functionality allows you to integrate Azure Machine Learning with other Azure services. At present, the only linked service offering is to connect to Azure Synapse Analytics, which is a modern data warehousing offering on Azure.
- Mean Absolute Error (MAE): An evaluation measure for any regression model. It is the average difference between predicted and actual values. This works well when dealing with small ranges of numbers.
- Mean Absolute Percent Error (MAPE): An evaluation measure for any regression model. It is the percentage difference between the predicted and actual values. If the actual value is 0, MAPE will fail with a divide by 0 error, so it is not a good measure if the actual value can be 0. MAPE works best when you have large ranges of numbers.
- Microservice: A lightweight, independent service. Typically, microservices have one job and communicate with each other using well-defined operations.
- Node (input, output): An input or output connection point on a component. Each component will have 0 to 3 input nodes and 0 to 3 output nodes. Each input or output node has a specific type, such as DataFrameDirectory, TransformationDirectory, or UntrainedModelDirectory. An input of DataFrameDirectory can only attach to an output of the same type.
- Overfitting: A situation which happens when a trained model latches onto the particular relationships within a training data set, but those particulars are not always indicative of the broader world.
- Pipeline (Azure ML): A collection of components connected together in a defined order. The metaphor represents how data moves from a source (an initial dataset) and flows through components until it reaches a destination. There are two types of pipeline: training pipelines and inference pipelines.
- Pipeline Asset: A component available within Azure Machine Learning. This includes datasets you have imported, sample datasets which come with the service, and different components to transform, train, evaluate, and deploy models.
- Precision: A measure which calculates how frequently our predicted value is correct. It is defined as True Positives / (True Positives + False Positives).
- R^2 (R-squared): An evaluation measure for linear regression models which ranges from 0-1, where 1 is the highest possible score.
- Recall: A measure which calculates how frequently we correctly predict a value. It is defined as True Positives / (True Positives + False Negatives).
- Receiver Operating Characteristic (ROC) curve: A plot which represents true positive versus false positive rates for a two-class model.
- Reinforcement learning: A machine learning technique in which we train an agent to observe its environment and use those environmental clues to make a decision.
- Root Mean Square Error (RMSE): An evaluation measure for any regression model. RMSE works best when you are concerned with large differences between the predicted and actual values.
- Root Mean Square Log Error (RMSLE): An evaluation measure for any regression model. RMSLE works best when you are concerned with large percentage differences between the predicted and actual values.
- Run (Azure ML): An attempt to train a model in Azure Machine Learning. This can be done through a pipeline in the Azure ML designer or through Automated ML.
- Semi-supervised learning: A machine learning technique in which we have a small percentage of data with labels and a large percentage of unlabeled data.
- Sink node: A node with no outputs. An example of a sink node is Web Service Output.
- Source node: A node with no inputs. An example of a source node is any dataset you bring onto the canvas.
- Supervised learning: A machine learning technique in which we have a known good answer for our label and attempt to learn from this label for inference purposes. The most common examples of this include classification and regression.
- Unsupervised learning: A machine learning technique in which we do not have labels for our data. We use unsupervised learning techniques to try to discover what those labels should be. Clustering is the most common example of this.

### References
- [Pixel-level land cover classification](https://github.com/Azure/pixel_level_land_classification)
- [Genetic Algorithms in Antennas and Smart Antennas Design Overview: Two Novel Antenna Systems for Triband GNSS Applications and a Circular Switched Parasitic Array for WiMax Applications Developments with the Use of Genetic Algorithms](https://www.hindawi.com/journals/ijap/2014/729208/)
- [Genetic Algorithms](https://www2.econ.iastate.edu/tesfatsi/holland.gaintro.htm)
- [15 Real-World Applications of Genetic Algorithms](https://www.brainz.org/15-real-world-applications-genetic-algorithms/)
- [What is Genetic Programming?](https://www.genetic-programming.com/gpanimatedtutorial.html)
- [What is the Team Data Science Process?](https://learn.microsoft.com/en-us/azure/architecture/data-science-process/overview)
- [Machine Learning Algorithm Cheat Sheet for Azure Machine Learning designer](https://learn.microsoft.com/en-us/azure/machine-learning/algorithm-cheat-sheet?view=azureml-api-1)
- [Algorithm & component reference for Azure Machine Learning designer](https://learn.microsoft.com/en-us/azure/machine-learning/component-reference/component-reference?view=azureml-api-2&viewFallbackFrom=azureml-api-1)
- [Confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
- [Evaluate automated machine learning experiment results](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml?view=azureml-api-2)
- [MLOps](https://www.datarobot.com/platform/mlops/?redirect_source=algorithmia.com)
- [A Beginner's Guide to Reinforcement Learning with a Mario Bros Example](https://towardsdatascience.com/a-beginners-guide-to-reinforcement-learning-with-a-mario-bros-example-fa0e0563aeb7)
- [Data featurization in automated machine learning](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features?view=azureml-api-1)
- [ML.NET: An open source and cross-platform machine learning framework](https://dotnet.microsoft.com/en-us/apps/machinelearning-ai/ml-dotnet)
- [What is Machine Learning Server](https://learn.microsoft.com/en-us/machine-learning-server/what-is-machine-learning-server)
- [Boosted Decision Tree Regression component](https://learn.microsoft.com/en-us/azure/machine-learning/component-reference/boosted-decision-tree-regression?view=azureml-api-2)

### Computer Vision
What Is Computer Vision?

Use of AI to extract latent information from visual inputs

- Analyzes visual information from images, video files, and cameras
- Aims to imitate how the human brain processes visual data
- Complexity added by multiple types of information in images
  - Detect objects, faces, inferred context and actions, all in a single image

What Problems Does Computer Vision Address?

- Semantic segmentation
- Object detection
- Image classification
- Facial recognition
- Optical character recognition (OCR)

Computer Vision in the Real World

Widely used and provides many benefits

- Cellular phones
- Safety Security
- Social media
- Autonomous vehicles

### Learning Objectives

At the end of this lesson, you will be able to:

- Extract insights from images
- Build custom image classification solutions
- Detect, analyze, and recognize faces
- Analyze text using optical character recognition
- Create solutions to read typed and handwritten documents
- Extract the key-value pair and tables in forms
- Add Azure Computer Vision services into applications

Computer vision attempts to provide solutions for several core workloads. Each workload is geared towards handling the different types of data that can appear in visual inputs. These workloads include:

- Content Tagging is the process of analyzing images to identify well-known objects and attaching labels to each of them.
- Object Detection refers to the process of identifying entities contained within an image.
- Describe Images involves using artificial intelligence to understand the salient content in a photograph.
- Image Classification uses machine learning models to classify or categorize photos based on their primary subject matter.
- Facial Recognition refers to detecting human faces within an image and then analyzing and identifying those faces.
- Read Text uses artificial intelligence to extract text from images and documents using optical character recognition or OCR.



### Resources
- [customvision](https://www.customvision.ai/)
- [Computer Vision API (v3.1)](https://westcentralus.dev.cognitive.microsoft.com/docs/services/computer-vision-v3-1-ga/operations/56f91f2e778daf14a499f21b)
- [Azure AI Services](https://azure.microsoft.com/en-us/products/ai-services/)
- [Quickstart: Azure AI Vision v3.2 GA Read](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/quickstarts-sdk/client-library?pivots=programming-language-csharp&tabs=windows%2Cvisual-studio)
- [Image segmentation](https://en.wikipedia.org/wiki/Image_segmentation)
- [ML.NET Tutorial - Get started in 10 minutes](https://dotnet.microsoft.com/en-us/learn/ml-dotnet/get-started-tutorial/intro)

## ML for Trading

The course has three parts:

1. Manipulating Financial Data in Python
2. Computational Investing
3. Learning Algorithms for Trading

Books
- [Python for Finance, Yves Hilpisch](https://github.com/yhilpisch/py4fi2nd)
- What Hedge Funds Really Do, Philip J. Romero and Tucker Balch
- Machine Learning, Tom M. Mitchell

In [None]:
"""Bollinger Bands."""

import os
import pandas as pd
import matplotlib.pyplot as plt

def symbol_to_path(symbol, base_dir="data"):
    """Build the CSV file path for a ticker symbol.

    The result is ``<base_dir>/<symbol>.csv``, joined with the platform's
    path separator. The path is not checked against the file system.
    """
    filename = f"{symbol!s}.csv"
    return os.path.join(base_dir, filename)


def get_data(symbols, dates):
    """Read stock data (adjusted close) for given symbols from CSV files.

    Args:
        symbols: Iterable of ticker symbols. 'SPY' is added as the first
            symbol if absent; the caller's object is not modified.
        dates: Index (e.g. a pandas date range) for the returned frame.

    Returns:
        A DataFrame indexed by `dates` with one column per symbol, holding
        the 'Adj Close' values from each symbol's CSV file. Dates with no
        SPY value are dropped.
    """
    # Work on a copy so that adding SPY does not mutate the caller's list.
    symbols = list(symbols)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
                parse_dates=True, usecols=['Date', 'Adj Close'], na_values=['nan'])
        # Name the price column after the symbol so joined columns are distinct.
        df_temp = df_temp.rename(columns={'Adj Close': symbol})
        df = df.join(df_temp)
        if symbol == 'SPY':  # drop dates SPY did not trade
            df = df.dropna(subset=["SPY"])

    return df


def plot_data(df, title="Stock prices"):
    """Draw df as a chart with the given title and show it.

    The x axis is labelled "Date" and the y axis "Price".
    """
    axes = df.plot(title=title, fontsize=12)
    axes.set_xlabel("Date")
    axes.set_ylabel("Price")
    plt.show()


def get_rolling_mean(values, window):
    """Return rolling mean of given values, using specified window size.

    Args:
        values: pandas Series or DataFrame of values.
        window: Number of consecutive observations in each window.

    Returns:
        An object of the same shape as `values`; the first ``window - 1``
        entries are NaN because their windows are incomplete.
    """
    # The top-level pd.rolling_mean() function was removed from pandas; the
    # .rolling() accessor is its replacement.
    return values.rolling(window=window).mean()


def get_rolling_std(values, window):
    """Return rolling standard deviation of given values, using specified window size.

    Args:
        values: pandas Series or DataFrame of values.
        window: Number of consecutive observations in each window.

    Returns:
        An object of the same shape as `values`; the first ``window - 1``
        entries are NaN because their windows are incomplete.
    """
    return values.rolling(window=window).std()


def get_bollinger_bands(rm, rstd, num_std=2):
    """Return upper and lower Bollinger Bands.

    Args:
        rm: Rolling mean of the price series.
        rstd: Rolling standard deviation of the price series.
        num_std: Band half-width, in standard deviations (default 2).

    Returns:
        A tuple ``(upper_band, lower_band)`` where the bands lie `num_std`
        standard deviations above and below the rolling mean.
    """
    band_width = rstd * num_std
    upper_band = rm + band_width
    lower_band = rm - band_width
    return upper_band, lower_band


def test_run():
    """Plot 2012 SPY prices with their rolling mean and Bollinger Bands.

    Uses a window of 20 observations for both the rolling mean and the
    rolling standard deviation, then shows the chart.
    """
    # Load SPY prices for calendar year 2012.
    symbols = ['SPY']
    dates = pd.date_range('2012-01-01', '2012-12-31')
    df = get_data(symbols, dates)
    prices = df['SPY']

    # Rolling statistics, then the bands derived from them.
    rm_SPY = get_rolling_mean(prices, window=20)
    rstd_SPY = get_rolling_std(prices, window=20)
    upper_band, lower_band = get_bollinger_bands(rm_SPY, rstd_SPY)

    # The raw prices create the axes; every overlay is drawn onto the same axes.
    ax = prices.plot(title="Bollinger Bands", label='SPY')
    overlays = (
        ('Rolling mean', rm_SPY),
        ('upper band', upper_band),
        ('lower band', lower_band),
    )
    for label, series in overlays:
        series.plot(label=label, ax=ax)

    # Axis labels and legend.
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    ax.legend(loc='upper left')
    plt.show()


# Run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    test_run()
