In [None]:
from typing import List
import math 
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
# from sklearn import preprocessing

In [None]:
def get_counts_per_columns(value_counts: pd.core.series.Series, nbr_of_rank=6): # video=6 AND article 3
    """Return the observed count of every rank, ordered by rank.

    Args:
        value_counts: Result of ``Series.value_counts()`` for one factor,
            i.e. rank label -> number of occurrences.
        nbr_of_rank: Highest expected rank. Every rank in ``1..nbr_of_rank``
            that is absent from ``value_counts`` is reported with a count of 0.

    Returns:
        list: The counts, sorted by ascending rank label.
    """
    observed_pairs = list(value_counts.items())
    observed_ranks = [rank for rank, _ in observed_pairs]

    # Pad every expected rank that never occurred with a zero count.
    zero_pairs = [
        (rank, 0)
        for rank in range(1, nbr_of_rank + 1)
        if rank not in observed_ranks
    ]

    ordered_pairs = sorted(observed_pairs + zero_pairs, key=lambda pair: pair[0])
    return [count for _, count in ordered_pairs]


In [None]:
# Prepare the per-factor rank counts (contingency table) for the chi-square test of independence
def prepare_data(df: pd.DataFrame, nbr_of_rank=6):
    """Build the per-factor rank-count table from a frame of rankings.

    Args:
        df: One column per factor holding rank values; a ``user`` column,
            if present, is ignored.
        nbr_of_rank: Number of ranks counted for every factor.

    Returns:
        tuple: ``(table, factor_names, count_rows, counts_by_factor)`` where
        ``table`` is a DataFrame with a leading ``factors`` column followed by
        ``rank 1`` .. ``rank N``, ``factor_names`` lists the factor columns in
        order, ``count_rows`` holds one list of counts per factor and
        ``counts_by_factor`` maps each factor name to its list of counts.
    """
    # The respondent identifier is not a factor, so leave it out of the counts.
    if "user" in df.columns.tolist():
        df = df.drop(columns=['user'])

    factor_names = list(df.columns.values)
    count_rows = [
        get_counts_per_columns(df[factor].value_counts(), nbr_of_rank=nbr_of_rank)
        for factor in factor_names
    ]
    counts_by_factor = dict(zip(factor_names, count_rows))

    rank_columns = [f"rank {position}" for position in range(1, nbr_of_rank + 1)]
    table = pd.DataFrame(count_rows, columns=rank_columns)
    table.insert(0, 'factors', factor_names)

    return (table, factor_names, count_rows, counts_by_factor)


In [None]:
from scipy.stats import chi2_contingency

def get_chi_test_result(observed_info: list, significance_level: float = 0.05, interpret: bool = False):
    """Run a chi-square test on a contingency table and print the outcome.

    Args:
        observed_info: Observed frequencies, one inner list per row,
            e.g. ``[[100, 200, 300], [50, 60, 70]]``.
        significance_level: Alpha compared against the p-value when
            ``interpret`` is true.
        interpret: When true, additionally print whether the null hypothesis
            is rejected. Defaults to ``False`` so the printed output is the
            same as before (the interpretation used to be disabled).

    Returns:
        tuple: ``(chi2, p, dof, expected)`` as produced by
        ``scipy.stats.chi2_contingency``, so callers can reuse the numbers
        instead of parsing the printed text.
    """
    chi2, p, dof, expected = chi2_contingency(observed_info)

    # Report the test statistics.
    print("p-value:", p)
    print("\nDegree of freedom:", dof)
    print("chi2-value:", chi2)

    if interpret:
        if p <= significance_level:
            print("Reject NULL HYPOTHESIS: There is a significant association between the variables OR factors.")
        else:
            # A non-significant result never "accepts" the null hypothesis;
            # it only fails to reject it.
            print("Fail to reject NULL HYPOTHESIS: No significant association between the variables OR factors.")

    return chi2, p, dof, expected


# Generate a Sample of n (default 100) User Opinion Survey Responses

In [None]:
import random

def save_csv(df: pd.DataFrame, filename: str = "sample_data.csv"):
    """Write ``df`` to ``filename`` as a UTF-8 CSV file without the index column."""
    df.to_csv(path_or_buf=filename, index=False, encoding='utf-8')

def generate_rows_data(columns_names: list, nbr_of_rank=6, nbr_of_sample = 100, csv_file = True, seed=None):
    """Generate random survey answers where every user ranks all factors.

    Each generated row is a random permutation of ``"Rank 1"`` ..
    ``"Rank <nbr_of_rank>"``, one label per factor column.

    Args:
        columns_names: Factor column names; one is needed per rank because
            every row contains ``nbr_of_rank`` labels.
        nbr_of_rank: Number of distinct ranks handed out per user.
        nbr_of_sample: Number of simulated users (rows).
        csv_file: When true, the frame is also written with ``save_csv``
            using its default file name.
        seed: Optional seed for a private random generator so the sample is
            reproducible. ``None`` (default) keeps using the module-level
            ``random`` state, as before.

    Returns:
        pd.DataFrame: A ``user`` column (1..nbr_of_sample) followed by the
        factor columns.
    """
    rng = random if seed is None else random.Random(seed)
    rank_values = range(1, nbr_of_rank + 1)

    data = []
    for _ in range(nbr_of_sample):
        # Sampling the whole population yields a random permutation of the ranks.
        permutation = rng.sample(rank_values, nbr_of_rank)
        data.append([f"Rank {rank}" for rank in permutation])

    df = pd.DataFrame(data, columns=columns_names)
    df.insert(0, 'user', range(1, len(df) + 1))

    if csv_file:
        save_csv(df=df)
    return df


# Getting Factor Weight for Ranking

In [None]:
# Column sets used by the alternative (commented-out) data sources below.
columns_to_read_1 = [
    'Similarity Score',
    'Creation date',
    'No. of Views',
    "No. of Likes on YouTube",
    "Rating on CourseMapper",
    "No of. Save on CourseMapper",
]
columns_to_read_2 = [
    'Similarity Score',
    "Rating on CourseMapper",
    "No of. Save on CourseMapper",
]

# Number of ranks handed out per respondent.
nbr_of_rank = 6

# Alternative data sources, kept for reference:
# df = pd.read_excel("factors ranking 2 (Responses).xlsx", usecols=columns_to_read_1)
# df = pd.read_excel("factors ranking 2 (Responses)_3_f.xlsx", usecols=columns_to_read_2)
# df = generate_rows_data(columns_names=columns_to_read_1, nbr_of_rank=6, nbr_of_sample=100)
# df = generate_rows_data(columns_names=columns_to_read_2, nbr_of_rank=nbr_of_rank, nbr_of_sample=10)

# NOTE(review): machine-specific absolute path; only the commented-out read below refers to it.
ROOT = "/Users/wkana001/Desktop/work/tests/sch"
# df = pd.read_csv(f"{ROOT}/sample_data.csv", usecols=columns_to_read_1)
# df = pd.read_csv("sample_data.csv", usecols=columns_to_read_1)

# Factor columns read from the survey export.
columns_default = [
    "No. of Likes on YouTube",
    "Creation Date",
    "No. of Views on YouTube",
    "Similarity Score",
    "No. of Saves on ABC",
    "User Ratings on ABC",
]
df = pd.read_csv("factors ranking.csv", usecols=columns_default)

def rename_label_rank(x):
    """Convert a ``"Rank <n>"`` survey label into the integer ``n``.

    Any positive rank number is accepted rather than a fixed list of six
    labels, so surveys with a different number of ranks are handled too.

    Args:
        x: A cell value, typically a label such as ``"Rank 3"``.

    Returns:
        The integer rank for a canonical ``"Rank <n>"`` label with ``n >= 1``
        and no leading zeros; every other value is returned unchanged.
    """
    prefix = "Rank "
    if isinstance(x, str) and x.startswith(prefix):
        number = x[len(prefix):]
        # Accept only the canonical spelling of a positive integer, so values
        # such as "Rank 0", "Rank 01" or "Rank x" pass through unchanged.
        if number.isdecimal() and str(int(number)) == number and int(number) >= 1:
            return int(number)
    return x

def reverse_rank_to_weight(x, nbr_of_rank=6):
    """Turn a rank into a weight by reversing the scale (best rank -> highest weight).

    Args:
        x: A rank value; ranks equal to an integer in ``1..nbr_of_rank`` are
            converted.
        nbr_of_rank: Size of the ranking scale. The default of 6 reproduces
            the previous fixed mapping (1 -> 6, ..., 6 -> 1); pass 3 for a
            three-rank survey so that 1 -> 3 and 3 -> 1.

    Returns:
        ``nbr_of_rank + 1 - x`` as an int for a valid rank; any other value
        is returned unchanged.
    """
    if x in range(1, nbr_of_rank + 1):
        return nbr_of_rank + 1 - int(x)
    return x

# Replace the "Rank N" labels by their numeric rank, one column at a time.
for name in df.columns:
    df[name] = df[name].apply(rename_label_rank)

df.tail(5)

In [None]:
# Sanity check: (number of rows, number of columns) of the current ``df``.
df.shape

In [None]:
# # chisqt = pd.crosstab(BIKE.holiday, BIKE.weathersit, margins=True)
# # cross_tab = pd.crosstab(df['Similarity Score'], df['Creation date'], df['No. of Views']
# #                         # df['No. of Likes on YouTube'], df['Rating on CourseMapper'], 
# #                         # df['No of. Save on CourseMapper']
# #                         )
# # cross_tab = pd.crosstab(df['Similarity Score'], [df['Creation date'], df['No. of Views'], 
# #                                        df['No. of Likes on YouTube'], df['Rating on CourseMapper'], 
# #                                        df['No of. Save on CourseMapper']
# #                                        ]
# #                         )
# # cross_tab

# # data = df.head(5).to_dict(orient='list')

# import pandas as pd

# # Given dictionary with 'list' orientation
# data = {
#     'A': [2, 2, 2, 2, 2],
#     'B': [5, 4, 6, 5, 4],
#     'C': [4, 1, 1, 6, 3],
#     'D': [3, 6, 5, 3, 5],
#     'E': [1, 5, 4, 4, 1],
#     'F': [6, 3, 3, 1, 6]
# }

# # Convert dictionary to DataFrame
# df = pd.DataFrame(data)

# # Perform cross-tabulation
# # cross_tab = pd.crosstab(df['A'], [df['B'], df['C'], df['D'], df['E'], df['F']])
# cross_tab = pd.crosstab(df['B'], df['C'])

# # Display the cross-tabulation
# print("Cross-tabulation:")
# cross_tab


In [None]:
# xxxx

In [None]:
# Count how often each factor received each rank, then test the resulting
# contingency table.
ppd = prepare_data(df=df, nbr_of_rank=nbr_of_rank)
rank_table, observed_counts = ppd[0], ppd[2]
get_chi_test_result(observed_info=observed_counts)

rank_table.head(6)

In [None]:
# Raw weight of a factor: the largest count it received for any single rank.
factor_highest_counts = {factor: max(counts) for factor, counts in ppd[3].items()}

# ``factor_weights`` is another name for the very same dict object.
factor_weights = factor_highest_counts
factor_weights

In [None]:
def get_factor_weights(df: pd.DataFrame):
    """Compute the mean weight of every factor from a frame of ranks.

    Each rank is converted with ``reverse_rank_to_weight`` and the converted
    values of a column are averaged. The input frame is left untouched, so
    calling the function repeatedly on the same frame gives the same result.

    Args:
        df: One column per factor holding numeric ranks; a ``user`` column,
            if present, is ignored.

    Returns:
        dict: Factor name -> mean weight rounded to 3 decimals.

    Raises:
        ZeroDivisionError: If the frame has no rows.
    """
    # remove user column
    if "user" in df.columns.values.tolist():
        df = df.drop(columns=['user'])

    factors_weights_cols = {}
    for name in df.columns.values:
        # Convert into a temporary list only; writing the converted values
        # back into ``df`` would modify the caller's DataFrame.
        col_values = df[name].apply(reverse_rank_to_weight).tolist()
        factors_weights_cols[name] = round((sum(col_values) / len(col_values)), 3)

    return factors_weights_cols

# factor_weights = get_factor_weights(df)
# factor_weights


In [None]:
import numpy as np
from sklearn.preprocessing import normalize as normalize_sklearn, MinMaxScaler as MinMaxScaler_sklearn

def normalize_factor_weights(factor_weights: dict=None, values: list=None, method_type = "l1", complete=True, sum_value=True): # List[float]
    """Normalize factor weights with one of several techniques.

    https://www.pythonprog.com/sklearn-preprocessing-normalize/#Normalization_Techniques
    TypeScript: https://sklearn.vercel.app/guide/install

    Example input::

        factor_weights = { 'similarity_score': 0.7, 'creation_date': 0.3, 'nbr_views': 0.3,
            'nbr_likes_youTube': 0.1, 'rating_courseMapper': 0.1, 'nbr_save_courseMapper': 0.1
        }

    Args:
        factor_weights: Mapping of factor name -> raw weight. When given and
            non-empty it takes precedence over ``values``.
        values: Raw weights without names; used only when ``factor_weights``
            is empty or ``None``.
        method_type: Normalization technique:
            l1: L1 normalization, also known as L1 norm normalization or Manhattan normalization
            l2: L2 normalization, also known as L2 norm normalization or Euclidean normalization
            max: Max Normalization
            min-max: Min-Max
        complete: When true and factor names are available, return a dict of
            name -> normalized weight; otherwise return a plain list.
        sum_value: When true, print the sum of the normalized weights.

    Returns:
        dict or list: The normalized weights. ``l1``/``l2``/``max`` results
        are rounded to 3 decimals; ``min-max`` results are not rounded.

    Raises:
        ValueError: If ``method_type`` is not one of the supported techniques.
    """
    supported_methods = ("l1", "l2", "max", "min-max")
    if method_type not in supported_methods:
        # Fail early with a clear message instead of a later TypeError on None.
        raise ValueError(
            f"Unsupported method_type {method_type!r}; expected one of {supported_methods}"
        )

    key_names = None
    if factor_weights:
        key_names = list(factor_weights.keys())
        values = list(factor_weights.values())
    elif values is None:
        values = []

    if method_type == "min-max":
        # The scaler works column-wise, so the weights become one column.
        data = np.array(values).reshape(-1, 1)
        scaler = MinMaxScaler_sklearn()
        scaler.fit(data)
        normalized_values = [row[0] for row in scaler.transform(data).tolist()]
    else:
        # "l1", "l2" and "max" are all handled by sklearn's normalize on one row.
        row = normalize_sklearn([values], norm=method_type).tolist()[0]
        normalized_values = [round(value, 3) for value in row]

    if sum_value:
        print("sun values: ", sum(normalized_values))

    # Names exist only when a mapping was passed; a bare list of values
    # cannot be turned into a dict, so it is returned as a list.
    if complete and key_names is not None:
        normalized_values = dict(zip(key_names, normalized_values))

    return normalized_values

def get_factor_weight_by_scores_normalized(scores: list):
    """Scale ``scores`` so that they add up to (approximately) 1.

    Args:
        scores: Numeric scores.

    Returns:
        list: Each score divided by the total of all scores, rounded to
        3 decimals.
    """
    total = sum(scores)
    return [round((score / total), 3) for score in scores]


In [None]:
# L1 normalization, also known as L1 norm normalization or Manhattan normalization
# Example inputs for the commented-out ``values=`` call below:
# values = [22, 28, 24, 25, 25, 21]
# values = [1, 0.6, 1, 0, 0, 0]

# normalized_weights = normalize_factor_weights(values=values, method_type="l1", complete=False, sum_value=True)
# Normalized weights as a plain list (complete=False); the sum is printed as well.
normalized_weights = normalize_factor_weights(factor_weights=factor_weights, method_type="l1", complete=False, sum_value=True)
print("normalized_weights: ", normalized_weights)
# Same normalization again, this time returned as a dict keyed by factor name (complete=True).
print(normalize_factor_weights(factor_weights=factor_weights, method_type="l1", complete=True, sum_value=True))

# NOTE: the triple-quoted string below is never executed. It holds disabled
# alternative normalizations (l2, max, min-max) as plain text; as the last
# expression of the cell it may be echoed as the cell output.
"""
# l1: L2 normalization, also known as L2 norm normalization or Euclidean normalization

normalized_weights = normalize_factor_weights(factor_weights=factor_weights, method_type="l2", complete=False, sum_value=False)
normalized_weights = get_factor_weight_by_scores_normalized(normalized_weights)
print("normalized_weights: ", normalized_weights)
print("sun: ", sum(normalized_weights))
print()

# max: Max Normalization (dividing by the highest value)

normalized_weights = normalize_factor_weights(factor_weights=factor_weights, method_type="max", complete=False, sum_value=False)
normalized_weights = get_factor_weight_by_scores_normalized(normalized_weights)
print("normalized_weights: ", normalized_weights)
print("sun: ", sum(normalized_weights))
print()

# min-max: Min-Max

normalized_weights = normalize_factor_weights(factor_weights=factor_weights, method_type="min-max", complete=False, sum_value=False)
normalized_weights = get_factor_weight_by_scores_normalized(normalized_weights)
print("normalized_weights: ", normalized_weights)
print("sun: ", sum(normalized_weights))

"""


In [None]:
# Build the summary from a fresh mapping instead of overwriting the entries of
# ``factor_weights``: that dict (also reachable as ``factor_highest_counts``)
# keeps its numeric counts, and this cell gives the same table when re-run.
factor_summary = {
    key: [str(count), normalized_weights[i]]
    for i, (key, count) in enumerate(factor_weights.items())
}

# NOTE(review): ``df`` is rebound to the summary table here, replacing the
# survey frame loaded earlier; kept as-is for compatibility.
df = pd.DataFrame(factor_summary)
new_column_data = ["highest count", "normalized score"]
new_column_name = ''
df.insert(0, new_column_name, new_column_data)
df

In [None]:
# Hand-picked raw weights per resource type.
data = {
    "video": {
        "views": 0.2,
        "rating": 0.1,
        "creation_date": 0.3,
        "similarity_score": 0.1,
        "bookmark": 0.1,
        "like_count": 0.1
    },
    "article": {
        "rating": 0.4,
        "similarity_score": 0.3,
        "bookmark": 0.3
    }
}

# L1-normalize the weights of every resource type, keeping the factor names.
weigths_normalized = {
    resource_type: normalize_factor_weights(
        factor_weights=raw_weights,
        method_type="l1",
        complete=True,
        sum_value=False,
    )
    for resource_type, raw_weights in data.items()
}

result = {"original": data, "normalized": weigths_normalized}
result

In [None]:
# Counts for the three factors kept for articles; the YouTube-related
# factors are commented out.
article = {
    # 'No. of Likes on YouTube': 22,
    # 'Creation Date': 31,
    # 'No. of Views on YouTube': 22,
    'Similarity Score': 23,
    'No. of Saves on ABC': 30,
    'User Ratings on ABC': 23
}

# L1-normalize the article counts into a plain list and print their sum.
normalized_weights = normalize_factor_weights(
    factor_weights=article,
    method_type="l1",
    complete=False,
    sum_value=True,
)
print("normalized_weights: ", normalized_weights)
# print(normalize_factor_weights(factor_weights=factor_weights, method_type="l1", complete=True, sum_value=True))