In [None]:
!pip install statsmodels
!pip install ucimlrepo
!pip install mlxtend
!pip install yellowbrick

In [None]:
import pandas as pd
import numpy as np
# Location of the input CSV. The original call was `pd.read_csv()` with no
# argument, which raises TypeError because the path is a required parameter.
# NOTE(review): the file name below is an assumption based on the column names
# and plot titles used later in this notebook ("Mall Customer") — confirm.
DATA_PATH = "Mall_Customers.csv"

# Columns to discard right after loading; left empty exactly as in the original.
COLUMNS_TO_DROP = []

data = pd.read_csv(DATA_PATH)
data.info()
# Rebind instead of `inplace = True` so that re-running the cell is harmless.
data = data.drop(columns = COLUMNS_TO_DROP)

In [None]:
# Collect the names of every column stored as a 64-bit integer or float.
numeric_dtypes = ['int64', 'float64']
numerical_columns = list(data.select_dtypes(include = numeric_dtypes).columns)

# Display only the numeric part of the frame.
data.loc[:, numerical_columns]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Standardise only the numeric columns gathered in `numerical_columns`.
# The original scaled the whole frame, which fails as soon as a text column
# is still present; for an all-numeric frame the result is unchanged.
sc = StandardScaler()
data_scaled = sc.fit_transform(data[numerical_columns])

# k-distance heuristic for choosing DBSCAN's eps: for every point, get the
# distances to its 5 nearest neighbours in the scaled space. The query set is
# the training set itself, so each point is expected to appear as its own
# nearest neighbour (distance 0 in column 0).
neighbor = NearestNeighbors(n_neighbors = 5)
neighbordist = neighbor.fit(data_scaled)
distance, indices = neighbordist.kneighbors(data_scaled)

# Show a small preview rather than dumping the whole array.
distance[:5]

In [None]:
import matplotlib.pyplot as plt
# Sort each neighbour-rank column independently in ascending order, then plot
# the last column (index 4, the farthest of the 5 neighbours requested).
# The "knee" of this curve is the usual candidate for DBSCAN's eps.
distance = np.sort(distance, axis = 0)
plt.plot(distance[:, 4])
plt.xlabel('Points (sorted by distance)')
plt.ylabel('Distance to neighbour at index 4')
plt.title('k-distance plot for choosing eps')
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Candidate neighbourhood radii and density thresholds to try.
eps = [0.6, 0.7, 0.8, 0.9]
min_pts = [4, 5, 6]  # rule of thumb noted by the author: 2 * number of dimensions, i.e. 2 * 3 = 6


def _dbscan_summary(radius, min_samples):
    """Fit DBSCAN on `data_scaled` and return (eps, min_samples, n_clusters, n_noise)."""
    labels = DBSCAN(eps = radius, min_samples = min_samples).fit_predict(data_scaled)
    noise_mask = labels == -1
    # Label -1 marks noise and is not counted as a cluster.
    cluster_ids = set(labels[~noise_mask].tolist())
    return (radius, min_samples, len(cluster_ids), int(noise_mask.sum()))


# One row per (eps, min_samples) pair, eps varying slowest.
result = [_dbscan_summary(radius, min_samples) for radius in eps for min_samples in min_pts]

result_df = pd.DataFrame(result, columns = ['eps', 'minsamples', 'n_clusters', 'n_noise'])
result_df

WHAT IS LEARNING?
The ability to improve one's behavior with experience.
WHAT IS MACHINE LEARNING?
Explores algorithms that learn from Data, build models from Data and this model can be used for different tasks. Eg) Prediction, Decision making or solving tasks
Machine learning is the field of study that gives computers the ability to learn without being explicitly programmed
"A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance at tasks in T, as measured by P, improves with experience E.“
•TASKS – T – behavior of the task that the learning program is seeking to improve. Like Prediction, Classification and acting in an environment
•Experience –E- Data: Used for improving the experience at the task
•Measure of improvement P – Increase accuracy or new skills to agents or increase efficiency of problem solving we can improve the performance measure
•Learner: Gains experience from data and incorporates background knowledge.
•Reasoner: Solves tasks or problems based on models built by the learner and delivers solutions or performance.
Key Elements:

    Experience on Data: Input from real-world data used for learning.
    Problem or Task: The challenge that needs solving, driving the reasoning process.
    Background Knowledge: Pre-existing knowledge that aids the learning process.
    Build Models: Collaboration between the learner and reasoner to produce models.
    Answer or Solution Performance: The measurable outcome of the reasoning process.

This framework is widely applied in fields like Machine Learning, Artificial Intelligence, and Knowledge-Based Systems, emphasizing how models are developed, learned, and used for decision-making.

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def plot_income_vs_spending(frame):
    """Scatter annual income against spending score, coloured by the 'Clusters' column of `frame`."""
    plt.scatter(frame['Annual Income (k$)'], frame['Spending Score (1-100)'],
                c = frame['Clusters'], cmap = 'viridis')
    plt.xlabel('Annual Income')
    plt.ylabel('Spending Score')
    plt.title('DBSCAN for Mall Customer')
    plt.show()


# Final DBSCAN model with the chosen hyper-parameters.
db_model = DBSCAN(eps = 0.6, min_samples = 6)
y_cluster = db_model.fit_predict(data_scaled)

# Attach the labels to the frame and summarise each cluster by its column means.
data['Clusters'] = y_cluster
# numeric_only keeps the summary working if a text column is still present.
cluster_analysis = data.groupby('Clusters').mean(numeric_only = True)
# A bare expression in the middle of a cell is not rendered, so display it explicitly.
display(cluster_analysis)

# With noise points (label -1) included, so outliers are visible.
plot_income_vs_spending(data)

# Without noise points. The filtered rows go into a NEW frame: the original
# reassigned `data`, which made a re-run of this cell fail (label array and
# frame lengths no longer match) and removed those rows for every later cell.
data_without_noise = data[data['Clusters'] != -1]
plot_income_vs_spending(data_without_noise)

# Internal validation metrics on all points; the labels are passed as-is, so
# noise points (label -1) are scored as if they formed one more cluster.
ss_score = silhouette_score(data_scaled, y_cluster)
dv_score = davies_bouldin_score(data_scaled, y_cluster)
ch_score = calinski_harabasz_score(data_scaled, y_cluster)
print('Silhouette Score', ss_score)
print('Davis Bouldin', dv_score)
print('Calinski Score', ch_score)

MACHINE LEARNING VS TRADITIONAL PROGRAMMING
I. Approach to Problem-Solving:
•Traditional Programming:
•In traditional programming, a programmer writes code (rules) to solve a problem or perform a task.
•The input and the code produce the output.
•For instance, in a calculator program, the programmer explicitly codes all functionalities (like addition, subtraction) needed for the calculations.

•Machine Learning:
•In machine learning, the input (data) and the output (answers) are fed into an algorithm to create a model.
•The model, once trained, can take new inputs to predict or make decisions based on its learning.
•For example, in a machine learning model trained to recognize handwritten digits, the model learns from a dataset of digits and their labels.

II Data Dependency:

Traditional Programming:
It doesn't rely heavily on data.
The focus is more on logic and algorithms that don't change unless manually updated.

Machine Learning:
It is heavily dependent on data.
The quality and quantity of data fed into the model significantly affect its accuracy and efficiency.

III Flexibility and Adaptability:
•Traditional Programming:
•Changes in problem specifications or requirements necessitate changes in the program's code.
•The system does not adapt automatically to new scenarios.

•Machine Learning:
•Models can adapt to new data independently.
•They can learn from new data and improve over time, making them more flexible in handling unforeseen scenarios.

IV Complexity of Problems Solved:

•Traditional Programming:
•More effective for problems with clear rules and logic.
•Struggles with tasks that are too complex to manually code, like natural language processing or image recognition.

•Machine Learning:
•Excels at handling complex problems that are difficult to solve with traditional rule-based programming, especially when patterns or correlations are not immediately apparent.

V Output Predictability:
•
• Traditional Programming:
•Outputs are predictable and consistent for the same input and code.

•Machine Learning:
•Outputs can vary, and there is a probability of error or inaccuracy, especially in new or edge-case scenarios.

1. Supervised learning – Also called predictive learning.
•A machine predicts the class of unknown objects based on prior class-related information of similar objects.
2. Unsupervised learning – Also called descriptive learning.
•A machine finds patterns in unknown objects by grouping similar objects together.
3. Reinforcement learning – Also called trial-and-error learning.
•A machine learns to act on its own to achieve the given goals.

•The training set given for supervised learning is the labeled dataset.
•Supervised learning tries to find the relationships between the feature set and the label set, which is the knowledge and properties we can learn from labeled dataset.
• If each feature vector $x$ corresponds to a label $y \in L$, where $L = \{l_1, l_2, \dots, l_c\}$
•($c$ usually ranges from 2 to about a hundred), the learning problem is called classification. On the other hand, if each feature vector $x$ corresponds to a real value $y \in \mathbb{R}$, the learning problem is called a regression problem.
•The knowledge extracted from supervised learning is often utilized for prediction and recognition

Objective:
To evaluate a machine learning model's performance on a limited dataset by dividing it into multiple folds for training and testing.
Dataset Split:
       The dataset is split into two parts:
        Training subset: Used for building models.
        Test subset: Used for final evaluation.
k-Fold Process:
   The training subset is further divided into k equally sized folds (subsets).
    For each of the k runs (iterations):
        1 fold is used as the Test fold for validation.
        The remaining k-1 folds are used as Training folds to build the model.

Steps in Each Run:
    A model is trained on the k−1 training folds.
    It is validated (performance measured) on the Test fold.
    The process is repeated k times, each time using a different fold as the test set.

Scores:
    After each run, the model’s performance score S1,S2,...,Sk is recorded.
    The final performance of the model is calculated as the average score across all k runs.

Model Selection and Evaluation:
    The model with the best performance metric is selected.
    The final model is built on the full training subset.
    The selected model is evaluated on the test subset to estimate the generalization error.

SUPERVISED LEARNING EXAMPLES
•Supervised learning is effective for a variety of business purposes, including sales forecasting, inventory optimization, and fraud detection. Some examples of use cases include:
•Predicting real estate prices
•Classifying whether bank transactions are fraudulent or not
•Finding disease risk factors
•Determining whether loan applicants are low-risk or high-risk
•Predicting the failure of industrial equipment's mechanical parts

UNSUPERVISED LEARNING ALGORITHM
•The training set given for unsupervised learning is an unlabeled dataset.
• Unsupervised learning aims at clustering, probability density estimation, finding associations among features, and dimensionality reduction.
• In general, an unsupervised algorithm may simultaneously learn more than one of the properties listed above, and the results of unsupervised learning can be further used for supervised learning.

Unsupervised learning examples
•Unsupervised algorithms are widely used to create descriptive models.
•Common applications also include clustering, which creates a model that groups objects together based on specific properties, and association, which identifies the rules between the clusters.
•A few example use cases include:
•Creating customer groups based on purchase behavior
•Grouping inventory according to sales and/or manufacturing metrics
•Pinpointing associations in customer data (for example, customers who buy a specific style of handbag might be interested in a specific style of shoe)

REINFORCEMENT LEARNING ALGORITHM

•RL is a machine learning paradigm where an agent learns from rewards obtained by performing a series of actions.
•The agent does not receive explicit instructions or correct/false labels for its actions.
•Feedback comes in the form of rewards, and the agent must explore and discover the optimal actions to achieve maximum rewards.
•Typical applications of reinforcement learning involve playing games (chess, Go, Atari video games) and some form of robots, e.g., drones, warehouse robots, and more recently self driving cars.
•Robotics: Training robots to perform tasks through trial and error.
•Game AI: Teaching agents to play games like chess or Go.
•Autonomous Vehicles: Learning to navigate environments safely and efficiently.
•Learning by interacting with an environment to maximize rewards.
•Example: Training a robot to walk or play chess.
•Common approaches: Q-Learning, Deep Q-Networks (DQN).

•Reinforcement learning examples
Practical applications for this type of machine learning are still emerging. Some examples of uses include:
•Teaching cars to park themselves and drive autonomously
•Dynamically controlling traffic lights to reduce traffic jams
•Training robots to learn policies using raw video images as input that they can use to replicate the actions they see
•

SEMI SUPERVISED LEARNING
•In addition to these three types, a fourth type of machine learning category, semi-supervised learning, has attracted increasing attention recently.
•It is defined between supervised and unsupervised learning, contains both labeled and unlabeled data, and jointly learns knowledge from them.

HYPOTHESIS SPACE and INDUCTIVE BIAS
•In inductive learning or prediction problems:
•Given examples / data
•Examples are of the form (x,y)
•x is the instances and y is the output
•This can be specified as (x, f(x)). We want to learn the function f.
•Depending on the form of f(x), the problem is:
•Classification – f(x): discrete
•Regression – f(x): continuous
•Probability estimation – f(x): the probability of x

FEATURE SPACE
•When we say we have to learn a function, it is a function of the features; so instances are described in terms of features.
•Features are properties that describe each instance.
•Each instance can be described in a quantitative manner using features.
•Often we have multiple features, so we have what we call a feature vector. For example, for a particular task we may describe every instance in terms of ten features, so the feature vector will be a one-dimensional vector of size 10.
•
•

APPLICATIONS OF MACHINE LEARNING
•After the field of machine learning was “founded” more than a half a century ago, we can now find applications of machine learning in almost every aspect of our life.
•Popular applications of machine learning include the following:
• Email spam detection
• Face detection and matching (e.g., iPhone X, Windows laptops, etc.)
• Web search (e.g., DuckDuckGo, Bing, Baidu, Google)
• Sports predictions
•
• Post office (e.g., sorting letters by zip codes)
• ATMs (e.g., reading checks)
• Credit card fraud
• Stock predictions
• Smart assistants (Apple Siri, Amazon Alexa, . . . )
• Product recommendations (e.g., Walmart, Netflix, Amazon)
• Self-driving cars (e.g., Uber, Tesla)
• Language translation (Google translate)
• Sentiment analysis
•Drug design
• Medical diagnoses

DOMAINS AND APPLICATIONS
MEDICINE:
•DIAGNOSE DISEASE
•Input symptoms, lab measurements, test results, DNA tests,….
•Output: one of a set of possible diseases, or “none of the above”
•Data mine historical medical records to learn which future patients will respond best to which treatments.
VISION:
•Say what objects appear in an image
•Convert hand-written digits to characters 0…9
•Detect where objects appear in an image
ROBOT CONTROL:
•Design autonomous mobile robots that learn to navigate from their own experience.
NLP:
•Detect where entities are mentioned in NL
•Detect what facts are expressed in NL
•Detect if a product/movie review is positive, negative or neutral
•
•Speech Recognition
•Machine translation
Financial:
•Predict if a stock will rise or fall
•In the next few milliseconds
•Predict if a user will click on an ad or not
•In order to decide which ad to show
•
•

APPLICATION IN BUSINESS INTELLIGENCE
•Forecasting product sales, quantities taking seasonality and trend into account
•Identifying cross selling promotional opportunities for consumer goods
•Identify the price sensitivity of a consumer product and identify the optimum price point that maximizes net profit
•Optimizing product location at a super market retail outlet.
•Modeling variables impacting customers churn and refining strategy.
•
•
OTHER APPLICATIONS
•Fraud Detection: Credit card providers
•Determine whether or not someone will default on a home mortgage
•Understand consumer sentiment based on unstructured data
•Forecasting women’s conviction rates based on external macroeconomic factors
•

HOW TO CREATE A LEARNER?
1.Choose the training experience or the data. It is nothing but the features
2.Choose the target function, i.e., the function that is to be learned and that the model will represent.
3.Choose how to represent the target function: an appropriate class of functions over the features. This class of functions is called the hypothesis language.
4.Choose a Learning algorithm to infer the target function.

MACHINE LEARNING PIPELINE
•A machine learning pipeline is a way to codify and automate the workflow it takes to produce a machine learning model.
•Machine learning pipelines consist of multiple sequential steps that do everything from data extraction and preprocessing to model training and deployment.

I Data Ingestion: 

•The process begins with ingesting raw data from different sources, such as databases, files, or APIs.
•This step is crucial to ensure that the pipeline has access to relevant and up-to-date information.
•

II Data Preprocessing: 
•Raw data often contains noise, missing values, or inconsistencies.
•The preprocessing stage involves cleaning, transforming, and encoding the data, making it suitable for machine learning algorithms.
•Common preprocessing tasks include handling missing data, normalization, and categorical encoding.
•

III Feature Engineering: 
•In this stage, new features are created from the existing data to improve model performance.
•Techniques such as dimensionality reduction, feature selection, or feature extraction can be employed to identify and create the most informative features for the ML algorithm.
•Business knowledge can come in handy at this step of the pipeline.
•

IV Model Training: 
•The preprocessed data is fed into the chosen ML algorithm to train the model.
•The training process involves adjusting the model’s parameters to minimize a predefined loss function, which measures the difference between the model’s predictions and the actual values.
•

V Model Validation:
•To evaluate the model’s performance, a validation dataset (a portion of the data that the model never saw) is used.
•In classification problems, metrics such as accuracy, precision, recall, or F1-score can be used to assess how well the model generalizes to new (unseen) data.

VI Hyper parameter Tuning:
•Hyper parameters are the parameters of the ML algorithm that are not learned during the training process but are set before training begins.
•Tuning hyper parameters involves searching for the optimal set of values that minimize the validation error and helps achieve the best possible model’s performance.

VII Model evaluation
• After training, the model's performance is assessed using a separate testing dataset or through cross-validation.
• Common evaluation metrics depend on the specific problem but may include accuracy, precision, recall, F1-score, mean squared error or others.
•
VIII Model deployment:
•Once a satisfactory model is developed and evaluated, it can be deployed to a production environment where it can make predictions on new, unseen data.
•Deployment may involve creating APIs and integrating with other systems.

IX Monitoring and maintenance:
•After deployment, it's important to continuously monitor the model's performance and retrain it as needed to adapt to changing data patterns.
•This step ensures that the model remains accurate and reliable in a real-world setting.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples,silhouette_score
from yellowbrick.cluster import KElbowVisualizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Preview the first rows of `data` as a quick sanity check before the K-Means analysis.
data.head()

In [None]:
# Pairwise correlation between the numeric columns only.
correlation_matrix = data.corr(numeric_only = True)
correlation_matrix

In [None]:
# Annual income vs. spending score; both marker colour and marker size encode Age.
fig, ax = plt.subplots(figsize = (10, 6))
scatter = sns.scatterplot(data = data, x = 'Annual Income (k$)', y = 'Spending Score (1-100)',
                          hue = 'Age', size = 'Age', sizes = (20, 200),
                          palette = 'viridis', ax = ax)
ax.set_title('Scatterplot between AI vs SS w.r.t Age')
ax.set_xlabel('Annual Income', fontsize = 12)
ax.set_ylabel('Spending Score', fontsize = 12)
ax.grid(True)
plt.show()

In [None]:
# Plain scatter of the two features, slightly transparent so overlapping points stay visible.
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = data, x = 'Annual Income (k$)', y = 'Spending Score (1-100)',
                alpha = 0.8, ax = ax)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Features for K-Means: everything except Age and Genre. A 'Clusters' column is
# removed as well, because the clustering cells store their labels in `data`
# and those labels must not be fed back in as a feature.
# `errors = 'ignore'` keeps the cell re-runnable after 'Genre' has already been
# dropped from `data` by a later cell.
data_final = data.drop(columns = ['Age', 'Genre', 'Clusters'], errors = 'ignore')
sc = StandardScaler()
data_scaled = sc.fit_transform(data_final)

# For each candidate k, record the inertia (within-cluster sum of squares)
# and the silhouette score of the fitted model.
sum_of_squared_distance = []
s_score = []
k_range = range(2, 10)
for k in k_range:
    km = KMeans(n_clusters = k, random_state = 42)
    km.fit(data_scaled)
    sum_of_squared_distance.append(km.inertia_)
    s_score.append(silhouette_score(data_scaled, km.labels_))
sum_of_squared_distance

In [None]:
# Two panels side by side: inertia on the left, silhouette score on the right.
fig, (inertia_ax, silhouette_ax) = plt.subplots(1, 2, figsize = (15, 5))

inertia_ax.plot(k_range, sum_of_squared_distance, marker = 'o')
inertia_ax.set_title('Elbow Method - Inertia')
inertia_ax.set_xlabel('No. of clusters')
inertia_ax.set_ylabel('SS distance')

silhouette_ax.plot(k_range, s_score, marker = 'o')
silhouette_ax.set_title('Elbow Method - SS')
silhouette_ax.set_xlabel('No. of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')

plt.show()

BENEFITS OF PIPELINE
•Unattended runs
•Easy Debugging
•Easy tracking and versioning
•Fast execution
•Collaboration
•Reusability
•Heterogeneous Compute

•Benefits of Machine Learning Pipelines
•Unattended runs
•The pipeline allows you to schedule different steps to run in parallel in a reliable and unattended way.
•It means you can focus on other tasks simultaneously when the process of data modeling and preparation is going on.

•Easy Debugging
•Using pipeline, there is a separate function for each task(such as different functions for data cleaning and data modeling).
•It becomes easy to debug the complete code and find out the issues in a particular step.

•Easy tracking and versioning
•We can use a pipeline to explicitly name and version the data sources, inputs, and output rather than manually tracking data and outputs for each iteration.

•Fast execution
•As we discussed above, in the ML pipeline, each part of the workflow acts as an independent element, which allows the software to run faster and generate an efficient and high-quality output.

•Collaboration
•Using pipelines, data scientists can collaborate over each phase of the ML design process and can also work on different pipeline steps simultaneously.


•Reusability
•We can create pipeline templates for particular scenarios and can reuse them as per requirement.
•For example, creating a template for retraining and batch scoring.
•

•Heterogeneous Compute
•We can use multiple pipelines which are reliably coordinated over heterogeneous computer resources as well as different storage locations.
• It allows making efficient use of resources by running separate pipelines steps on different computing resources, e.g., GPUs, Data Science VMs, etc.
•
CHALLENGES IN MACHINE LEARNING
•Do I have enough data?
•Is the data of sufficient quality?
•Errors in data. Eg. Age=255 ; noise in low resolution images
•Missing values
•Am I describing the data correctly?
•Are age and income enough? Should I look at Gender also?
•How should I represent age? As a number or as young, middle age , old?
•How good is a model?
•How do I choose a model?
•How confident can I be of the results?

In [1]:
# Let Yellowbrick pick k from the silhouette curve.
# This cell needs `KMeans`, `KElbowVisualizer` and `data_scaled` from the cells
# above; running it on a fresh kernel raises NameError.
# NOTE(review): `k = (2, 8)` is presumably treated as a half-open range
# (k = 2..7), which is narrower than the manual loop's range(2, 10) — confirm.
model = KMeans(random_state = 42)
visualizer = KElbowVisualizer(model, k = (2, 8), metric = 'silhouette', timings = False)
visualizer.fit(data_scaled)
# `show()` replaces the deprecated `poof()`.
visualizer.show()

# Chosen number of clusters, reused when fitting the final model.
# NOTE(review): presumably None when no elbow is detected — verify before use.
e = visualizer.elbow_value_
print(e)

NameError: name 'KMeans' is not defined

In [None]:
# Fit the final K-Means model with the elbow-selected k and keep its labels.
km = KMeans(n_clusters = e, random_state = 42).fit(data_scaled)
y_label = km.labels_

# Store the cluster assignment next to the original features.
data['Clusters'] = y_label
y_label

In [None]:
# Cluster centres of the fitted model; `km` was fitted on `data_scaled`,
# so these coordinates are in the standardised feature space.
centroids = km.cluster_centers_
print("Centroid", centroids)

In [None]:
# Remove the text column before averaging. Rebinding with `errors = 'ignore'`
# (instead of an in-place drop) keeps the cell re-runnable: the original raised
# KeyError on a second run because 'Genre' was already gone.
data = data.drop(columns = 'Genre', errors = 'ignore')

# Mean of every remaining column within each cluster.
cluster_analysis = data.groupby('Clusters').mean()
cluster_analysis

In [None]:
plt.figure(figsize = (18, 8))

# (cluster id, marker colour, legend label), drawn in this order.
# NOTE(review): the labels are tied to specific cluster ids, which depend on the
# fitted model — confirm they still match after any change to the clustering.
segments = [
    (0, 'orange', 'Standard'),
    (1, 'magenta', 'Careless'),
    (2, 'green', 'Target Group'),
    (3, 'red', 'Careful'),
    (4, 'blue', 'Sensible'),
]
for cluster_id, colour, segment_name in segments:
    members = data[data.Clusters == cluster_id]
    plt.scatter(members['Annual Income (k$)'], members['Spending Score (1-100)'],
                color = colour, label = segment_name)

plt.title('Cluster Result', fontweight = 'bold', fontsize = 20)
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend(fontsize = 15)
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
# Internal validation of the final K-Means labelling on the scaled features.
kmeans_labels = km.labels_
score_kmeans_ss = silhouette_score(data_scaled, kmeans_labels)
score_kmeans_c = calinski_harabasz_score(data_scaled, kmeans_labels)
score_kmeans_d = davies_bouldin_score(data_scaled, kmeans_labels)

# Printed in the order: silhouette, Calinski-Harabasz, Davies-Bouldin.
for score in (score_kmeans_ss, score_kmeans_c, score_kmeans_d):
    print(score)

Silhouette Score: Measures how similar an object is to its own cluster compared with other clusters. The value lies between -1 and +1: +1 means the clusters are well separated, 0 means the clusters overlap, and -1 means poor clustering.

Calinski-Harabasz Score: Ranges from 0 to +infinity. Also called the variance ratio criterion: ratio = between-cluster dispersion / within-cluster dispersion. The higher the value, the better the clustering.

Davies-Bouldin Score: Ranges from 0 to +infinity. It is the average similarity between each cluster and its most similar cluster, where similarity compares intra-cluster similarity with inter-cluster differences. The lower the value, the better the clustering.