### Euclidean

In [None]:
import numpy as np
from scipy.spatial.distance import mahalanobis

# 1. Euclidean Distance

def euclidean_distance(point1, point2):
    """Calculate the Euclidean distance between two points."""
    return np.sqrt(np.sum((np.array(point1) - np.array(point2))**2))

# Example
p1, p2 = [109, 2,8], [4, 6,7]
print(f"Euclidean Distance: {euclidean_distance(p1, p2):.2f}")

Euclidean Distance: 105.08


### Cosine Similarity

In [None]:

# 2. Cosine Similarity

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Example
v1, v2 = [1, 0, -1], [-1, 0, 1]
print(f"Cosine Similarity: {cosine_similarity(v1, v2):.2f}")


### Mahalanobis Distance

In [None]:
# 3. Mahalanobis Distance

def mahalanobis_distance(point1, point2, cov_matrix):
    """Calculate the Mahalanobis distance between two points."""
    diff = np.array(point1) - np.array(point2)
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    return np.sqrt(np.dot(np.dot(diff.T, inv_cov_matrix), diff))

# Example
data_points = np.array([[1, 2], [3, 4], [5, 6]])
cov_matrix = np.cov(data_points, rowvar=False)
p1, p2 = [1, 2], [5, 6]
print(f"Mahalanobis Distance: {mahalanobis_distance(p1, p2, cov_matrix):.2f}")


### Jaccard Similarity

In [None]:
# 4. Jaccard Similarity

def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity between two sets."""
    set1, set2 = set(set1), set(set2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

# Example
s1, s2 = {1, 2, 3}, {2, 3, 4}
print(f"Jaccard Similarity: {jaccard_similarity(s1, s2):.2f}")


###  Hamming Distance

In [None]:
# 5. Hamming Distance

def hamming_distance(str1, str2):
    """Calculate the Hamming distance between two strings."""
    if len(str1) != len(str2):
        raise ValueError("Strings must be of the same length")
    return sum(ch1 != ch2 for ch1, ch2 in zip(str1, str2))

# Example
str1, str2 = "karolin", "kathrin"
print(f"Hamming Distance: {hamming_distance(str1, str2)}")


### Manhattan Distance

In [None]:
# 6. Manhattan Distance

def manhattan_distance(point1, point2):
    """Calculate the Manhattan distance between two points."""
    return np.sum(np.abs(np.array(point1) - np.array(point2)))

# Example
p1, p2 = [1, 2], [4, 6]
print(f"Manhattan Distance: {manhattan_distance(p1, p2):.2f}")


### Minkowski Distance

In [None]:
# 7. Minkowski Distance

def minkowski_distance(point1, point2, p):
    """Calculate the Minkowski distance between two points for a given p."""
    return np.power(np.sum(np.abs(np.array(point1) - np.array(point2))**p), 1/p)

# Example
p1, p2 = [1, 2], [4, 6]
print(f"Minkowski Distance (p=3): {minkowski_distance(p1, p2, 3):.2f}")

### Chebyshev Distance

In [None]:
# 8. Chebyshev Distance

def chebyshev_distance(point1, point2):
    """Calculate the Chebyshev distance between two points."""
    return np.max(np.abs(np.array(point1) - np.array(point2)))

# Example
p1, p2 = [1, 2], [4, 6]
print(f"Chebyshev Distance: {chebyshev_distance(p1, p2):.2f}")


### Bray-Curtis Dissimilarity

In [None]:
# 9. Bray-Curtis Dissimilarity

def bray_curtis_dissimilarity(vec1, vec2):
    """Calculate the Bray-Curtis dissimilarity between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.sum(np.abs(vec1 - vec2)) / np.sum(np.abs(vec1 + vec2))

# Example
v1, v2 = [1, 2, 3], [4, 5, 6]
print(f"Bray-Curtis Dissimilarity: {bray_curtis_dissimilarity(v1, v2):.2f}")


## Correlation Methods : Pearson,Spearman, KendalTau

In [None]:
# 10. Pearson Correlation Coefficient

def pearson_correlation(vec1, vec2):
    """Calculate the Pearson correlation coefficient between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    mean1, mean2 = np.mean(vec1), np.mean(vec2)
    numerator = np.sum((vec1 - mean1) * (vec2 - mean2))
    denominator = np.sqrt(np.sum((vec1 - mean1)**2) * np.sum((vec2 - mean2)**2))
    return numerator / denominator

# Example
v1, v2 = [1, 2, 3, 4], [1, 5, 7, 3]
print(f"Pearson Correlation Coefficient: {pearson_correlation(v1, v2):.2f}")

# 11. Spearman Correlation Coefficient

def spearman_correlation(vec1, vec2):
    """Calculate the Spearman correlation coefficient between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    rank_vec1 = np.argsort(np.argsort(vec1))
    rank_vec2 = np.argsort(np.argsort(vec2))
    return pearson_correlation(rank_vec1, rank_vec2)

# Example
v1, v2 = [1, 2, 3, 4], [1, 5, 7, 3]
print(f"Spearman Correlation Coefficient: {spearman_correlation(v1, v2):.2f}")

# 12. Kendall Tau Correlation

def kendall_tau_correlation(vec1, vec2):
    """Calculate the Kendall Tau correlation coefficient between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    n = len(vec1)
    concordant, discordant = 0, 0
    for i in range(n):
        for j in range(i + 1, n):
            concordant += (vec1[i] - vec1[j]) * (vec2[i] - vec2[j]) > 0
            discordant += (vec1[i] - vec1[j]) * (vec2[i] - vec2[j]) < 0
    return (concordant - discordant) / (0.5 * n * (n - 1))

# Example
v1, v2 = [1, 2, 3, 4], [1, 5, 7, 3]
print(f"Kendall Tau Correlation: {kendall_tau_correlation(v1, v2):.2f}")


Pearson Correlation Coefficient: 0.40
Spearman Correlation Coefficient: 0.40
Kendall Tau Correlation: 0.33


In [2]:
import numpy as np
from scipy.stats import pearsonr, kendalltau, spearmanr

def calculate_trend(data_row):
    """
    Calculate Pearson, Kendall, and Spearman correlations to assess trend in a data row.

    Parameters:
    - data_row (list or np.array): A 1D list or array of numerical values.

    Returns:
    - dict: Dictionary containing the correlation coefficients and p-values for each method.
    """
    # Generate indices (e.g., time or order)
    indices = np.arange(1, len(data_row) + 1)

    # Ensure the data row is a numpy array
    data_row = np.array(data_row)

    # Compute correlations
    pearson_corr, pearson_p = pearsonr(indices, data_row)
    kendall_corr, kendall_p = kendalltau(indices, data_row)
    spearman_corr, spearman_p = spearmanr(indices, data_row)

    # Return results as a dictionary
    return {
        "Pearson": {"correlation": pearson_corr, "p-value": pearson_p},
        "Kendall": {"correlation": kendall_corr, "p-value": kendall_p},
        "Spearman": {"correlation": spearman_corr, "p-value": spearman_p},
    }

# Example usage:
data_row = [3, 5, 7, 6, 8, 9]
trend = calculate_trend(data_row)
from pprint import pprint
pprint(trend)

{'Kendall': {'correlation': 0.8666666666666666,
             'p-value': 0.016666666666666666},
 'Pearson': {'correlation': 0.9402561526802476,
             'p-value': 0.005247368266448161},
 'Spearman': {'correlation': 0.942857142857143,
              'p-value': 0.004804664723032055}}


## Gower Score

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def gower_distance_df(X_df, Y_df=None):
    """
    Compute Gower's distance between two datasets (X_df, Y_df).

    Parameters:
        X_df : pandas DataFrame, shape (n_samples, n_features)
            First data matrix (numerical and categorical).
        Y_df : pandas DataFrame, shape (m_samples, n_features), optional
            Second data matrix. If None, pairwise distance with X_df is computed.

    Returns:
        dist : array-like, shape (n_samples, m_samples)
            Gower distance matrix.
    """
    # Separate numerical and categorical columns
    num_cols = X_df.select_dtypes(include=[np.number]).columns
    cat_cols = X_df.select_dtypes(exclude=[np.number]).columns

    # Function to compute Gower distance for numerical features
    def numerical_distance(a, b):
        # Add epsilon to avoid division by zero
        range_val = np.max([a, b]) - np.min([a, b])
        range_val = range_val if range_val > 0 else 1e-6  # Avoid zero range
        return np.abs(a - b) / range_val

    # Function to compute Gower distance for categorical features
    def categorical_distance(a, b):
        return 0 if a == b else 1

    # Function to compute Gower distance between two samples
    def gower_single(x, y):
        # Convert numpy arrays (rows) into pandas Series to index by column names
        x_series = pd.Series(x, index=X_df.columns)
        y_series = pd.Series(y, index=X_df.columns)

        total_distance = 0
        # Calculate numerical distance
        for col in num_cols:
            total_distance += numerical_distance(x_series[col], y_series[col])

        # Calculate categorical distance
        for col in cat_cols:
            total_distance += categorical_distance(x_series[col], y_series[col])

        # Normalize by the number of features
        total_distance /= len(num_cols) + len(cat_cols)
        return total_distance

    # Apply cdist to compute pairwise Gower distance
    dist_matrix = cdist(X_df.values, Y_df.values if Y_df is not None else X_df.values, metric=lambda u, v: gower_single(u, v))

    return dist_matrix

# Create a sample dataset with both numerical and categorical features
data = {
    'age': [25, 35, 45, 55, 65, 22, 28, 40, 50, 60],
    'income': [30000, 40000, 50000, 60000, 70000, 32000, 34000, 45000, 54000, 65000],
    'product_type': ['A', 'B', 'A', 'B', 'A', 'C', 'A', 'C', 'B', 'C'],
    'account_status': ['active', 'inactive', 'active', 'inactive', 'active', 'active', 'inactive', 'active', 'inactive', 'active']
}

# Convert the data into a pandas DataFrame
df = pd.DataFrame(data)

# Compute the Gower distance matrix
gower_distance_matrix = gower_distance_df(df)

# Display the Gower distance matrix
print("Gower Distance Matrix:")
print(gower_distance_matrix)


Gower Distance Matrix:
[[0.   1.   0.5  1.   0.5  0.75 0.75 0.75 1.   0.75]
 [1.   0.   1.   0.5  1.   1.   0.75 1.   0.5  1.  ]
 [0.5  1.   0.   1.   0.5  0.75 0.75 0.75 1.   0.75]
 [1.   0.5  1.   0.   1.   1.   0.75 1.   0.5  1.  ]
 [0.5  1.   0.5  1.   0.   0.75 0.75 0.75 1.   0.75]
 [0.75 1.   0.75 1.   0.75 0.   1.   0.5  1.   0.5 ]
 [0.75 0.75 0.75 0.75 0.75 1.   0.   1.   0.75 1.  ]
 [0.75 1.   0.75 1.   0.75 0.5  1.   0.   1.   0.5 ]
 [1.   0.5  1.   0.5  1.   1.   0.75 1.   0.   1.  ]
 [0.75 1.   0.75 1.   0.75 0.5  1.   0.5  1.   0.  ]]
