# Q1.Generate a list of 100 integers containing values between 90 to 130 and store it in the variable `int_list`. After generating the list, find the following:

In [None]:
import math

def generate_int_list(start, end, size=None):
    """Return a list of integers taken from the inclusive range [start, end].

    Args:
        start: Smallest value that may appear in the list.
        end: Largest value that may appear in the list (inclusive).
        size: Number of random integers to draw. When omitted, every integer
            from ``start`` to ``end`` is returned exactly once, in ascending
            order.

    Returns:
        A list of ints.
    """
    if size is None:
        return list(range(start, end + 1))
    # Imported locally because this cell only imports ``math``.
    import random
    return [random.randint(start, end) for _ in range(size)]


# Q1 asks for 100 integers with values between 90 and 130
int_list = generate_int_list(90, 130, 100)

# (i) Mean
def calculate_mean(numbers):
    """Return the arithmetic mean of ``numbers``, or None when it is empty."""
    if numbers:
        total = sum(numbers)
        return total / len(numbers)
    return None

# (ii) Median
def calculate_median(numbers):
    """Return the median of ``numbers``, or None when it is empty.

    The input is left untouched: a sorted copy is used instead of sorting
    the caller's list in place.

    Args:
        numbers: A sized collection of comparable numeric values.

    Returns:
        The middle value for an odd count, the mean of the two middle values
        for an even count, or None for an empty collection.
    """
    if len(numbers) == 0:
        return None
    ordered = sorted(numbers)
    middle = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2

# (iii) Mode
def calculate_mode(numbers):
    """Return every value that occurs most often in ``numbers``.

    Args:
        numbers: An iterable of hashable values.

    Returns:
        A list of the most frequent values, in order of first appearance.
        The list is empty when ``numbers`` is empty.
    """
    counts = {}
    for num in numbers:
        counts[num] = counts.get(num, 0) + 1
    if not counts:
        # max() of an empty sequence would raise ValueError.
        return []
    max_count = max(counts.values())
    return [num for num, count in counts.items() if count == max_count]

# (iv) Weighted Mean
def calculate_weighted_mean(values, weights):
    """Return the weighted mean of ``values``.

    Args:
        values: Sequence of numbers.
        weights: Sequence of weights, one per value.

    Returns:
        sum(value * weight) / sum(weight), or None when the sequences differ
        in length or the weights sum to zero (which includes empty input).
    """
    if len(values) != len(weights):
        return None
    total_weight = sum(weights)
    if total_weight == 0:
        # Undefined mean; report it like the other invalid-input cases.
        return None
    weighted_sum = sum(value * weight for value, weight in zip(values, weights))
    return weighted_sum / total_weight

# (v) Geometric Mean
def calculate_geometric_mean(numbers):
    """Return the geometric mean of ``numbers``.

    The mean is computed as exp(mean(log(x))) rather than by multiplying all
    values together, so long lists or large values do not overflow when the
    product is converted to a float.

    Args:
        numbers: Sequence of strictly positive numbers.

    Returns:
        The geometric mean as a float, or None when ``numbers`` is empty or
        contains a value that is not strictly positive.
    """
    if not numbers or any(num <= 0 for num in numbers):
        return None
    log_total = math.fsum(math.log(num) for num in numbers)
    return math.exp(log_total / len(numbers))

# (vi) Harmonic Mean
def calculate_harmonic_mean(numbers):
    """Return the harmonic mean of ``numbers``.

    None is returned for empty input or when any value is zero or negative.
    """
    has_invalid_value = any(num <= 0 for num in numbers)
    if numbers and not has_invalid_value:
        reciprocal_total = sum(1 / num for num in numbers)
        return len(numbers) / reciprocal_total
    return None

# (vii) Midrange
def calculate_midrange(numbers):
    """Return the midpoint between the smallest and largest value.

    None is returned for empty input.
    """
    if numbers:
        lowest, highest = min(numbers), max(numbers)
        return (lowest + highest) / 2
    return None

# (viii) Trimmed Mean
def calculate_trimmed_mean(numbers, percentage):
    """Return the mean after discarding the smallest and largest values.

    Args:
        numbers: Sequence of numbers. It is not modified; a sorted copy is
            used.
        percentage: Total share of the data to discard, expressed as a
            fraction (0.2 means 20%). Half of it is removed from each end.

    Returns:
        The mean of the values that remain, or None when ``numbers`` is
        empty, ``percentage`` is negative, or the trim would remove every
        value.
    """
    if not numbers or percentage < 0:
        return None
    n = len(numbers)
    k = int(n * percentage / 2)  # values dropped from each end
    if 2 * k >= n:
        return None  # Nothing would be left after trimming
    # Slice with an explicit end index: numbers[k:-k] is empty when k == 0.
    kept = sorted(numbers)[k:n - k]
    return sum(kept) / len(kept)

# Report every Q1 statistic; each print("\n\n") only adds vertical spacing.
print("Int List:", int_list)
print("\n\n")
print("Mean:", calculate_mean(int_list))
print("\n\n")
print("Median:", calculate_median(int_list))
print("\n\n")
print("Mode:", calculate_mode(int_list))
print("\n\n")
print("Weighted Mean:", calculate_weighted_mean(int_list, [1] * len(int_list)))  # assuming equal weights
print("\n\n")
print("Geometric Mean:", calculate_geometric_mean(int_list))
print("\n\n")
print("Harmonic Mean:", calculate_harmonic_mean(int_list))
print("\n\n")
print("Midrange:", calculate_midrange(int_list))
print("\n\n")
# calculate_trimmed_mean computes k = int(n * percentage / 2), so the trim
# amount must be a fraction: 20% is 0.2. Passing 20 makes k exceed the list
# length and the function returns None.
print("Trimmed Mean:", calculate_trimmed_mean(int_list, 0.2))  # trimming 20% of the data points

# Q2.Generate a list of 500 integers containing values between 200 to 300 and store it in the variable `int_list2`.After generating the list, find the following

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, kde

def generate_int_list(start, end, size):
    """Draw ``size`` random integers from the inclusive range [start, end]."""
    # randint excludes its upper bound, hence the + 1.
    upper_exclusive = end + 1
    return np.random.randint(low=start, high=upper_exclusive, size=size)

# Draw 500 random integers with values from 200 to 300 (both ends included)
int_list2 = generate_int_list(200, 300, 500)

# (i) Visualization
def visualize_data(data):
    """Plot the density histogram of ``data`` with Gaussian and KDE overlays.

    A single figure is drawn containing:
      * a 30-bin histogram normalised to a density,
      * the normal density using the sample mean and standard deviation,
      * a Gaussian kernel density estimate of the data.

    Both overlays are skipped when every value is identical, because neither
    curve is defined for zero spread.

    Args:
        data: One-dimensional collection of numbers.
    """
    # Imported locally: the cell does not import gaussian_kde at the top.
    from scipy.stats import gaussian_kde

    values = np.asarray(data, dtype=float)
    mean, std = values.mean(), values.std()
    x = np.linspace(values.min(), values.max(), 200)

    plt.figure(figsize=(8, 4))

    # Frequency histogram
    plt.hist(values, bins=30, density=True, alpha=0.7, label="Frequency (density)")
    plt.title("Frequency Distribution")

    if std > 0:
        # Gaussian distribution fitted by sample mean / standard deviation
        plt.plot(x, norm.pdf(x, mean, std), color='red',
                 label=f"Gaussian (μ={mean:.2f}, σ={std:.2f})")

        # Smoothed kernel density estimate evaluated on the same grid
        density = gaussian_kde(values)
        plt.plot(x, density(x), color='green', label="KDE")

    plt.legend()
    plt.tight_layout()
    plt.show()

# Draw the plots for the Q2 sample
visualize_data(int_list2)

# (ii) Range
def calculate_range(numbers):
    """Return the spread of ``numbers``: largest value minus smallest value."""
    largest = max(numbers)
    smallest = min(numbers)
    return largest - smallest

# (iii) Variance and Standard Deviation
def calculate_variance_and_std_dev(numbers):
    """Return ``(variance, standard deviation)`` of ``numbers``.

    The sum of squared deviations is divided by ``len(numbers)``, i.e. this
    is the population form, not the n - 1 sample form.
    """
    center = np.mean(numbers)
    squared_deviations = (numbers - center) ** 2
    variance = np.sum(squared_deviations) / len(numbers)
    return variance, np.sqrt(variance)

# (iv) Interquartile Range (IQR)
def calculate_iqr(numbers):
    """Return the interquartile range: 75th percentile minus 25th percentile."""
    lower_quartile, upper_quartile = np.quantile(numbers, [0.25, 0.75])
    return upper_quartile - lower_quartile

# (v) Coefficient of Variation
def calculate_coefficient_of_variation(numbers):
    """Return the standard deviation as a percentage of the mean."""
    relative_spread = np.std(numbers) / np.mean(numbers)
    return relative_spread * 100

# (vi) Mean Absolute Deviation (MAD)
def calculate_mad(numbers):
    """Return the mean absolute deviation of ``numbers`` around their mean."""
    absolute_deviations = np.abs(numbers - np.mean(numbers))
    return np.mean(absolute_deviations)

# (vii) Quartile Deviation
def calculate_quartile_deviation(numbers):
    """Return half the distance between the 25th and 75th percentiles."""
    q1, q3 = (np.quantile(numbers, level) for level in (0.25, 0.75))
    return (q3 - q1) / 2

# (viii) Range-based Coefficient of Dispersion
def calculate_range_based_coefficient_of_dispersion(numbers):
    """Return the range of ``numbers`` divided by their mean."""
    spread = calculate_range(numbers)
    return spread / np.mean(numbers)

# Example usage: print each dispersion measure of int_list2, with blank
# lines between consecutive entries
dispersion_report = [
    ("Range:", calculate_range),
    ("Variance and Standard Deviation:", calculate_variance_and_std_dev),
    ("IQR:", calculate_iqr),
    ("Coefficient of Variation:", calculate_coefficient_of_variation),
    ("MAD:", calculate_mad),
    ("Quartile Deviation:", calculate_quartile_deviation),
    ("Range-based Coefficient of Dispersion:", calculate_range_based_coefficient_of_dispersion),
]
for position, (label, measure) in enumerate(dispersion_report):
    if position:
        print("\n\n")
    print(label, measure(int_list2))

# 3.Write a Python class representing a discrete random variable with methods to calculate its expected value and variance

In [None]:
class DiscreteRandomVariable:
    """A discrete random variable given by its outcomes and probabilities."""

    def __init__(self, values, probabilities):
        """Store the distribution after checking that it is well formed.

        Args:
            values: Sequence of numeric outcomes.
            probabilities: Sequence with the probability of each outcome, in
                the same order as ``values``.

        Raises:
            ValueError: If the two sequences differ in length, a probability
                is negative, or the probabilities do not sum to 1.
        """
        # zip() in the methods below would silently drop unmatched entries.
        if len(values) != len(probabilities):
            raise ValueError("values and probabilities must have the same length")
        if any(p < 0 for p in probabilities):
            raise ValueError("probabilities must be non-negative")
        # Small tolerance absorbs floating-point rounding in the sum.
        if abs(sum(probabilities) - 1.0) > 1e-9:
            raise ValueError("probabilities must sum to 1")

        self.values = values
        self.probabilities = probabilities

    def expected_value(self):
        """Return E[X] = sum(x * p) over all outcomes."""
        return sum(x * p for x, p in zip(self.values, self.probabilities))

    def variance(self):
        """Return Var[X] = sum((x - E[X])**2 * p) over all outcomes."""
        mean = self.expected_value()
        return sum((x - mean) ** 2 * p for x, p in zip(self.values, self.probabilities))

    def __str__(self):
        return f"Discrete Random Variable: values={self.values}, probabilities={self.probabilities}"

# Example usage: a five-outcome variable, then its mean and variance
values = [1, 2, 3, 4, 5]
probabilities = [0.1, 0.2, 0.3, 0.2, 0.2]
rv = DiscreteRandomVariable(values, probabilities)
print(rv)
for label, value in (("Expected Value:", rv.expected_value()),
                     ("Variance:", rv.variance())):
    print("\n\n")
    print(label, value)

# 4.Implement a program to simulate the rolling of a fair six-sided die and calculate the expected value and variance of the outcomes

In [None]:
import random

def simulate_die_rolls(num_rolls):
    """Return a list of ``num_rolls`` random integers from 1 to 6 inclusive."""
    return [random.randint(1, 6) for _ in range(num_rolls)]

def calculate_expected_value_and_variance(rolls):
    """Return ``(mean, variance)`` of the observed rolls.

    The variance divides by the number of rolls (population form).
    """
    count = len(rolls)
    expected_value = sum(rolls) / count
    squared_gaps = ((roll - expected_value) ** 2 for roll in rolls)
    return expected_value, sum(squared_gaps) / count

# Simulate rolling a die 1000 times
num_rolls = 1000
rolls = simulate_die_rolls(num_rolls)

# Sample mean and variance of the simulated rolls
expected_value, variance = calculate_expected_value_and_variance(rolls)

# The long line ending reproduces the blank lines between the two results
print("Expected Value:", expected_value, end="\n\n\n\n")
print("Variance:", variance)

# 5.Create a Python function to generate random samples from a given probability distribution (e.g., binomial, Poisson) and calculate their mean and variance

In [None]:
import numpy as np
from scipy.stats import binom, poisson

def generate_samples(dist, *args, **kwargs):
    """Draw random samples from a named distribution and summarise them.

    Args:
        dist: ``'binom'`` or ``'poisson'``.
        *args: Distribution parameters: ``(n, p)`` for ``'binom'``,
            ``(mu,)`` for ``'poisson'``.
        **kwargs: ``size`` sets the number of samples (default 1000). Any
            other keyword (for example ``loc`` or ``random_state``) is
            forwarded to the distribution's ``rvs`` method.

    Returns:
        A tuple ``(samples, mean, variance)`` where the variance is
        ``np.var`` of the samples.

    Raises:
        ValueError: If ``dist`` is not supported.
    """
    # Pull size out first so it is not passed twice to rvs().
    size = kwargs.pop('size', 1000)

    if dist == 'binom':
        n, p = args
        samples = binom.rvs(n, p, size=size, **kwargs)
    elif dist == 'poisson':
        mu = args[0]
        samples = poisson.rvs(mu, size=size, **kwargs)
    else:
        raise ValueError("Unsupported distribution")

    mean = np.mean(samples)
    variance = np.var(samples)

    return samples, mean, variance

# Example usage:
# Binomial distribution with n trials and success probability p
n = 10
p = 0.5
samples, mean, variance = generate_samples('binom', n, p)
print("Binomial Distribution:")
for label, value in (("Samples:", samples), ("Mean:", mean), ("Variance:", variance)):
    print("\n\n")
    print(label, value)
print("\n\n")

# Poisson distribution with rate mu
mu = 5
samples, mean, variance = generate_samples('poisson', mu, loc=0)
print("\nPoisson Distribution:")
for label, value in (("Samples:", samples), ("Mean:", mean), ("Variance:", variance)):
    print("\n\n")
    print(label, value)

# 6.Write a Python script to generate random numbers from a Gaussian (normal) distribution and compute the mean, variance, and standard deviation of the samples

In [None]:
import numpy as np

def generate_normal_samples(mean, std_dev, size):
    """Draw ``size`` normal samples and return them with summary statistics.

    Returns:
        ``(samples, sample mean, sample variance, sample standard deviation)``
        using numpy's default (population) variance and standard deviation.
    """
    samples = np.random.normal(loc=mean, scale=std_dev, size=size)
    return samples, samples.mean(), samples.var(), samples.std()

# Example usage: 1000 draws from the standard normal distribution
mean = 0
std_dev = 1
size = 1000

samples, mean_sample, variance_sample, std_dev_sample = generate_normal_samples(mean, std_dev, size)

print("Gaussian Distribution:")
for label, value in (("Samples:", samples),
                     ("Mean:", mean_sample),
                     ("Variance:", variance_sample),
                     ("Standard Deviation:", std_dev_sample)):
    print("\n\n")
    print(label, value)

# 7. Use the seaborn library to load the tips dataset. Find the following from the dataset for the columns `total_bill` and `tip`

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the 'tips' dataset through seaborn
tips = sns.load_dataset('tips')

# Function to calculate skewness
def calculate_skewness(series):
    """Return the skewness reported by the series' own ``skew()`` method."""
    skewness = series.skew()
    return skewness

# Skewness of the total_bill and tip columns
total_bill_skewness, tip_skewness = (
    calculate_skewness(tips[column]) for column in ('total_bill', 'tip')
)

for column, skewness_value in (('total_bill', total_bill_skewness), ('tip', tip_skewness)):
    print(f"Skewness for {column} column:", skewness_value)

# Determine skewness type
def determine_skewness_type(skewness, tolerance=0.5):
    """Classify a skewness value as positive, negative or roughly symmetric.

    A computed skewness is almost never exactly 0.0, so values within
    ``tolerance`` of zero are reported as approximately symmetric.

    Args:
        skewness: The skewness coefficient to classify.
        tolerance: Largest absolute skewness still treated as symmetric.
            The default of 0.5 is a common rule-of-thumb cut-off; pass 0 to
            treat only an exact zero as symmetric.

    Returns:
        ``"Positive skewness"``, ``"Negative skewness"`` or
        ``"Approximately symmetric"``.

    Raises:
        ValueError: If ``skewness`` is NaN, for which no type is defined.
    """
    if skewness != skewness:  # NaN is the only value that differs from itself
        raise ValueError("skewness is NaN; the skewness type is undefined")
    if abs(skewness) <= tolerance:
        return "Approximately symmetric"
    return "Positive skewness" if skewness > 0 else "Negative skewness"

# Report the skewness category of each column (leading "\n" adds a blank line)
print("\nSkewness type for total_bill column:", determine_skewness_type(total_bill_skewness))
print("Skewness type for tip column:", determine_skewness_type(tip_skewness))

# Function to calculate covariance
def calculate_covariance(series1, series2):
    """Return the covariance between the two series.

    ``np.cov`` yields a 2x2 matrix; the off-diagonal entry is the covariance
    of ``series1`` with ``series2``.
    """
    covariance_matrix = np.cov(series1, series2)
    return covariance_matrix[0][1]

# Covariance between the total_bill and tip columns
covariance = calculate_covariance(tips['total_bill'], tips['tip'])
print("\nCovariance between total_bill and tip columns:", covariance)

# Function to calculate Pearson correlation coefficient
def calculate_pearson_correlation(series1, series2):
    """Return the Pearson correlation of ``series1`` with ``series2``."""
    return series1.corr(series2, method='pearson')

# Pearson correlation coefficient between the total_bill and tip columns
pearson_correlation = calculate_pearson_correlation(tips['total_bill'], tips['tip'])
print("\nPearson correlation coefficient between total_bill and tip columns:", pearson_correlation)

# Scatter plot of tip against total_bill to show the relationship visually
plt.scatter(tips['total_bill'], tips['tip'])
plt.xlabel('Total Bill')
plt.ylabel('Tip')
plt.title('Correlation between Total Bill and Tip')
plt.show()

# 8. Write a Python function to calculate the probability density function (PDF) of a continuous random variable for a given normal distribution

In [None]:
import numpy as np
import scipy.stats as stats

def calculate_normal_pdf(x, mean, std_dev):
    """Return the normal density with the given mean and std_dev at ``x``."""
    distribution = stats.norm(loc=mean, scale=std_dev)
    return distribution.pdf(x)

# Example usage: density of the normal distribution with mean 0 and
# standard deviation 1, evaluated at x = 1.5
mean = 0
std_dev = 1
x = 1.5
pdf_value = calculate_normal_pdf(x, mean, std_dev)
print("PDF value at x =", x, ":", pdf_value)

# 9.Create a program to calculate the cumulative distribution function (CDF) of exponential distribution

In [None]:
import numpy as np
import scipy.stats as stats

def calculate_exponential_cdf(x, rate):
    """Return the exponential CDF at ``x`` for the given rate.

    scipy parameterises the distribution by scale, which is 1 / rate.
    """
    scale = 1/rate
    return stats.expon.cdf(x, scale=scale)

# Example usage: CDF at x = 1.5 for an exponential distribution with rate 2
rate = 2.0
x = 1.5
cdf_value = calculate_exponential_cdf(x, rate)
print("CDF value at x =", x, ":", cdf_value)

# 10. Write a Python function to calculate the probability mass function (PMF) of Poisson distribution

In [None]:
def calculate_poisson_pmf(k, mu):
    """Return the Poisson probability mass at ``k`` for mean ``mu``."""
    distribution = stats.poisson(mu)
    return distribution.pmf(k)

# Example usage: PMF of a Poisson distribution with mean 5 at k = 3, 4 and 5
k_values = np.array([3, 4, 5])
mu = 5.0
pmf_values = calculate_poisson_pmf(k_values, mu)
print("PMF values:", pmf_values)

# 11. A company wants to test if a new website layout leads to a higher conversion rate (percentage of visitors who make a purchase). They collect data from the old and new layouts to compare

In [None]:
import numpy as np
from scipy.stats import norm

# Define the sample data (1 = visitor purchased, 0 = visitor did not)
old_layout = np.array([1] * 50 + [0] * 950)
new_layout = np.array([1] * 70 + [0] * 930)

# Calculate the sample proportions (conversion rates)
p_old = np.mean(old_layout)
p_new = np.mean(new_layout)

# Calculate the standard error of each proportion
se_old = np.sqrt(p_old * (1 - p_old) / len(old_layout))
se_new = np.sqrt(p_new * (1 - p_new) / len(new_layout))

# Calculate the z-score of the difference (new minus old)
z_score = (p_new - p_old) / np.sqrt(se_old**2 + se_new**2)

# Calculate the one-sided p-value for "new layout converts better".
# The signed z-score is used on purpose: taking abs() would also give a small
# p-value when the new layout converts worse than the old one.
p_value = norm.sf(z_score)

print("Sample proportions:")
print("\n\n")
print("Old layout:", p_old)
print("\n\n")
print("New layout:", p_new)
print("\n\n")

print("Z-score:", z_score)
print("\n\n")
print("P-value:", p_value)

# 12. A tutoring service claims that its program improves students' exam scores. A sample of students who participated in the program was taken, and their scores before and after the program were recorded

In [None]:
import numpy as np
from scipy.stats import norm

# Define the sample data: entry i of both arrays belongs to the same student
before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])
after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])

# Calculate the sample means
mu_before = np.mean(before_program)
mu_after = np.mean(after_program)

# The scores are paired (same students before and after), so the test is run
# on the per-student improvement instead of treating the two arrays as
# independent samples.
differences = after_program - before_program

# Calculate the standard error of the mean improvement
# (ddof=1 gives the sample standard deviation)
se_diff = np.std(differences, ddof=1) / np.sqrt(len(differences))

# Calculate the z-score of the mean improvement against zero
z_score = np.mean(differences) / se_diff

# Calculate the one-sided p-value for "scores improved". The signed z-score
# is used so that a drop in scores cannot look significant.
p_value = norm.sf(z_score)

print("Sample means:")
print("\n\n")
print("Before program:", mu_before)
print("\n\n")
print("After program:", mu_after)
print("\n\n")

print("Z-score:", z_score)
print("\n\n")
print("P-value:", p_value)

# 13. A pharmaceutical company wants to determine if a new drug is effective in reducing blood pressure. They conduct a study and record blood pressure measurements before and after administering the drug

In [None]:
import numpy as np
from scipy.stats import norm

# Define the sample data: entry i of both arrays belongs to the same patient
before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])
after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])

# Calculate the sample means
mu_before = np.mean(before_drug)
mu_after = np.mean(after_drug)

# The measurements are paired (same patients before and after), so the test
# is run on the per-patient reduction instead of treating the two arrays as
# independent samples.
reductions = before_drug - after_drug

# Calculate the standard error of the mean reduction
# (ddof=1 gives the sample standard deviation)
se_diff = np.std(reductions, ddof=1) / np.sqrt(len(reductions))

# Calculate the z-score of the mean reduction against zero
z_score = np.mean(reductions) / se_diff

# Calculate the one-sided p-value for "blood pressure went down". The signed
# z-score is used so that an increase cannot look significant.
p_value = norm.sf(z_score)

print("Sample means:")
print("\n\n")
print("Before drug:", mu_before)
print("\n\n")
print("After drug:", mu_after)
print("\n\n")


print("Z-score:", z_score)
print("\n\n")
print("P-value:", p_value)

# 14. A customer service department claims that their average response time is less than 5 minutes. A sample of recent customer interactions was taken, and the response times were recorded

In [None]:
import numpy as np
from scipy.stats import norm

# Define the sample data (response times in minutes)
response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])

# Calculate the sample mean
mu = np.mean(response_times)

# Calculate the standard error (ddof=1 gives the sample standard deviation)
se = np.std(response_times, ddof=1) / np.sqrt(len(response_times))

# Calculate the z-score against the claimed limit of 5 minutes
z_score = (mu - 5) / se

# Calculate the one-sided p-value for the claim "mean is less than 5".
# This is the lower tail of the signed z-score; sf(abs(z)) would also give a
# small p-value when the mean is far above 5.
p_value = norm.cdf(z_score)

print("Sample mean:", mu)
print("\n\n")
print("Z-score:", z_score)
print("\n\n")
print("P-value:", p_value)

# 15. A company is testing two different website layouts to see which one leads to higher click-through rates.Write a Python function to perform an A/B test analysis, including calculating the t-statistic, degrees of freedom, and p-value

In [None]:
import numpy as np
from scipy.stats import ttest_ind

def ab_test_analysis(layout_a_clicks, layout_b_clicks):
    """Compare two layouts with an independent two-sample t-test.

    Prints the sample means, t-statistic, degrees of freedom, p-value and a
    verdict at the 0.05 level, and also returns the numbers so callers can
    use them.

    Args:
        layout_a_clicks: Observations for layout A.
        layout_b_clicks: Observations for layout B.

    Returns:
        A tuple ``(t_stat, df, p_value)``.
    """
    # Calculate sample means
    mean_a = np.mean(layout_a_clicks)
    mean_b = np.mean(layout_b_clicks)

    # Calculate t-statistic and p-value (two-sided)
    t_stat, p_value = ttest_ind(layout_a_clicks, layout_b_clicks)

    # Calculate degrees of freedom (n_a + n_b - 2)
    df = len(layout_a_clicks) + len(layout_b_clicks) - 2

    # Print results
    print("Sample means:")
    print("\n\n")
    print("Layout A:", mean_a)
    print("\n\n")
    print("Layout B:", mean_b)
    print("\n\n")

    print("T-statistic:", t_stat)
    print("\n\n")
    print("Degrees of freedom:", df)
    print("\n\n")
    print("P-value:", p_value)

    # Interpret results
    if p_value < 0.05:
        print("The difference in click-through rates between the two layouts is statistically significant.")
    else:
        print("The difference in click-through rates between the two layouts is not statistically significant.")

    return t_stat, df, p_value

# Example usage with sample data (replace with your actual data):
# five observations per layout
layout_a_clicks = [10, 12, 8, 15, 9]
layout_b_clicks = [13, 11, 14, 10, 12]

ab_test_analysis(layout_a_clicks, layout_b_clicks)

# 16. A pharmaceutical company wants to determine if a new drug is more effective than an existing drug in reducing cholesterol levels. Create a program to analyze the clinical trial data and calculate the t-statistic and p-value for the treatment effect

In [None]:
import numpy as np
from scipy.stats import ttest_ind

def clinical_trial_analysis(existing_drug_levels, new_drug_levels):
    """Test whether the new drug yields lower cholesterol than the existing one.

    A one-sided independent two-sample t-test is used: the alternative is
    that the mean level under the existing drug is greater than under the new
    drug. A two-sided p-value would also be small when the new drug performs
    worse, which would wrongly print "more effective".

    Args:
        existing_drug_levels: Cholesterol levels measured with the existing drug.
        new_drug_levels: Cholesterol levels measured with the new drug.

    Returns:
        A tuple ``(t_stat, df, p_value)``.
    """
    # Calculate the sample means
    mean_existing = np.mean(existing_drug_levels)
    mean_new = np.mean(new_drug_levels)

    # Calculate the t-statistic and the one-sided p-value
    t_stat, p_value = ttest_ind(existing_drug_levels, new_drug_levels, alternative='greater')

    # Calculate the degrees of freedom (n_existing + n_new - 2)
    df = len(existing_drug_levels) + len(new_drug_levels) - 2

    print("Sample means:")
    print("\n\n")
    print("Existing drug:", mean_existing)
    print("\n\n")
    print("New drug:", mean_new)
    print("\n\n")

    print("T-statistic:", t_stat)
    print("\n\n")
    print("Degrees of freedom:", df)
    print("\n\n")
    print("P-value:", p_value)

    if p_value < 0.05:
        print("The new drug is statistically significantly more effective in reducing cholesterol levels.")
    else:
        print("The new drug is not statistically significantly more effective in reducing cholesterol levels.")

    return t_stat, df, p_value

# Call the function with the provided data (ten measurements per drug)
existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]
new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]
clinical_trial_analysis(existing_drug_levels, new_drug_levels)

# 17.A school district introduces an educational intervention program to improve math scores.Write a Python function to analyze pre- and post-intervention test scores, calculating the t-statistic and p-value to determine if the intervention had a significant impact

In [None]:
import scipy.stats


## for paired test 

def Scores_t_test(pre,post):
    """Run a paired t-test on pre/post scores and print the outcome.

    Prints the t-statistic, the p-value, the degrees of freedom
    (``len(pre) - 1``) and whether the null hypothesis is rejected at the
    0.05 level.
    """
    outcome = scipy.stats.ttest_rel(pre, post)
    t_stats, p_value = outcome.statistic, outcome.pvalue
    dof = len(pre) - 1
    alpha = 0.05

    report = (
        f"t_stats : {t_stats}",
        f"P value : {p_value}",
        f"degrees of freedom : {dof}",
        "__________________________________",
    )
    # Every report entry is followed by the same blank-line spacer
    for entry in report:
        print(entry)
        print("\n\n")

    print("Reject the null hypothisis" if p_value < alpha else "Fail to reject null hypothsis")
    


    
    
# Scores of the same ten students before and after the intervention
Pre_intervention_scores= [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]

post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

Scores_t_test(Pre_intervention_scores,post_intervention_scores)


# 18. An HR department wants to investigate if there's a gender-based salary gap within the company. Develop a program to analyze salary data, calculate the t-statistic, and determine if there's a statistically significant difference between the average salaries of male and female employees.

Use the below code to generate synthetic data:

Generate synthetic salary data for male and female employees

np.random.seed(0) # For reproducibility

male_salaries = np.random.normal(loc=50000, scale=10000, size=20)

female_salaries = np.random.normal(loc=55000, scale=9000, size=20)

In [None]:
# Generate synthetic salary data for male and female employees
np.random.seed(0) # For reproducibility
male_salaries = np.random.normal(loc=50000, scale=10000, size=20)
female_salaries = np.random.normal(loc=55000, scale=9000, size=20)

# Sample mean and sample standard deviation (ddof=1) of each group
male_mean, male_std = np.mean(male_salaries), np.std(male_salaries, ddof=1)
female_mean, female_std = np.mean(female_salaries), np.std(female_salaries, ddof=1)

# Two-sample t-test without assuming equal variances (equal_var=False)
t_test, P_val = scipy.stats.ttest_ind(male_salaries, female_salaries, equal_var=False)
alpha = 0.05
print(f"t-stats : {t_test}\n\n")
print(f"PValue : {P_val}\n\n")
print("Reject null hypothisis\n\n" if P_val < alpha else "Fail to reject the null hypothsis")

# 19. A manufacturer produces two different versions of a product and wants to compare their quality scores. Create a Python function to analyze quality assessment data, calculate the t-statistic, and decide whether there's a significant difference in quality between the two versions

In [None]:

def manufacture_varsions(v1,v2):
    """Compare two sets of quality scores with a two-sample t-test.

    Prints the t-statistic, the p-value and whether the null hypothesis is
    rejected at the 0.05 level.
    """
    outcome = scipy.stats.ttest_ind(v1, v2)
    t_test, p_val = outcome.statistic, outcome.pvalue
    alpha = 0.05

    for line in (f"t-stats : {t_test}\n\n",
                 f"PValue : {p_val}\n\n",
                 "_____________________________",
                 "\n\n"):
        print(line)

    verdict = "Reject null hypothisis" if p_val < alpha else "Fail to reject the null hypothsis\n\n"
    print(verdict)

# Quality scores of the two product versions (25 scores each)
versionl_scores=[85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87, 84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85]

version2_scores= [80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80, 79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82]

manufacture_varsions(versionl_scores,version2_scores)

# 20. A restaurant chain collects customer satisfaction scores for two different branches. Write a program to analyze the scores, calculate the t-statistic, and determine if there's a statistically significant difference in customer satisfaction between the branches

In [None]:
def satisfaction_customers_analysis(r1,r2):
    """Compare two branches' satisfaction scores with a two-sample t-test.

    Prints the t-statistic, the p-value and whether the null hypothesis is
    rejected at the 0.05 level.
    """
    outcome = scipy.stats.ttest_ind(r1, r2)
    t_test, p_val = outcome.statistic, outcome.pvalue
    alpha = 0.05

    for line in (f"t-stats : {t_test}\n\n",
                 f"PValue : {p_val}\n\n",
                 "_____________________________",
                 "\n\n"):
        print(line)

    verdict = "Reject null hypothisis\n\n" if p_val < alpha else "Fail to reject the null hypothsis"
    print(verdict)

# Customer satisfaction scores collected at the two branches
branch_a_scores = [4, 5, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3, 4, 5, 5, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 3, 4, 5, 4]
branch_b_scores = [3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2, 3, 4, 4, 3, 2, 3, 4, 3, 2, 4, 3, 3, 4, 2, 3, 4, 3]
satisfaction_customers_analysis(branch_a_scores,branch_b_scores)

# 21. A political analyst wants to determine if there is a significant association between age groups and voter preferences (Candidate A or Candidate B). They collect data from a sample of 500 voters and classify them into different age groups and candidate preferences. Perform a Chi-Square test to determine if there is a significant association between age groups and voter preferences

In [None]:
# Imported here because nothing above this cell imports chi2_contingency.
from scipy.stats import chi2_contingency

np.random.seed(0)

# The oldest group is listed twice under one label ('51+') so that it forms
# a single category; the misspelt variants '51+ ' and '51+1' produced two
# separate rows in the contingency table.
# NOTE(review): the question mentions 500 voters but only 30 are sampled
# here - confirm the intended sample size.
age_groups = np.random.choice(['18-30', '31-50', '51+', '51+'], size=30)

voter_preferences = np.random.choice(['Candidate A','Candidate B'],size=30)
data=pd.DataFrame({'Age group':age_groups , 'voter prefrences':voter_preferences})

# Contingency table: age group (rows) by preferred candidate (columns)
data_table=pd.crosstab(data['Age group'], data['voter prefrences'])
print(data_table)

# Chi-Square test of independence between age group and voter preference
chi_stat, p, dof, expected = chi2_contingency(data_table)
alpha = 0.05

print("\n\n")
print("Chi-Square Statistic:", chi_stat)
print("\n\n")
print("p-value:", p)
print("\n\n")
print("Degrees of Freedom:", dof)
print("\n\n")

if p < alpha:
    print("Reject null hypothesis: There is a significant association between age groups and voter preferences")
else:
    print("Fail to reject null hypothesis: There is no significant association between age groups and voter preferences")

# 22. A company conducted a customer satisfaction survey to determine if there is a significant relationship between product satisfaction levels (Satisfied, Neutral, Dissatisfied) and the region where customers are located (East, West, North, South). The survey data is summarized in a contingency table. Conduct a Chi-Square test to determine if there is a significant relationship between product satisfaction levels and customer regions

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# Observed frequencies as entered: one row per satisfaction level
# (Satisfied, Neutral, Dissatisfied), one column per region
# (East, West, North, South)
data = np.array([[50, 30, 40, 20],
                 [30, 40, 30, 50],
                 [20, 30, 40, 30]])

# Transposed so that the table has one row per region
data = data.T

# Chi-Square test of independence on the contingency table
chi_stat, p, dof, expected = chi2_contingency(data)
alpha = 0.05
if p < alpha:
    conclusion = "\n\nReject null hypothesis: There is a significant relationship between product satisfaction levels and customer regions\n\n"
else:
    conclusion = "Fail to reject null hypothesis: There is no significant relationship between product satisfaction levels and customer regions"
print(conclusion)

# 23. A company implemented an employee training program to improve job performance (Effective, Neutral, Ineffective). After the training, they collected data from a sample of employees and classified them based on their job performance before and after the training. Perform a Chi-Square test to determine if there is a significant difference between job performance levels before and after the training

In [None]:


from scipy.stats import chi2_contingency

# Observed counts: job performance level before training (rows) against
# the level after training (columns)
observed = np.array([[30, 20, 10],
                     [30, 40, 30],
                     [20, 30, 40]])

# Chi-Square test on the contingency table
chi2, p, dof, expected = chi2_contingency(observed)

# Output results, each followed by a blank-line spacer
for label, value in (("Chi-Square Statistic:", chi2),
                     ("p-value:", p),
                     ("Degrees of Freedom:", dof)):
    print(label, value)
    print("\n\n")
print("Expected Frequencies Table:")
print("\n\n")
print(expected)

# Significance level
alpha = 0.05

# Decide whether to reject the null hypothesis
if p < alpha:
    conclusion = "\n\nReject the null hypothesis. There is a significant difference between job performance levels before and after training"
else:
    conclusion = "Fail to reject the null hypothesis. There is no significant difference between job performance levels before and after training"
print(conclusion)

# 24. A company produces three different versions of a product: Standard, Premium, and Deluxe. The company wants to determine if there is a significant difference in customer satisfaction scores among the three product versions. They conducted a survey and collected customer satisfaction scores for each version from a random sample of customers. Perform an ANOVA test to determine if there is a significant difference in customer satisfaction scores

In [None]:
import pandas as pd
from scipy import stats

# Sample data: satisfaction scores per product version
standard_scores = [80, 85, 90, 78, 88, 82, 92, 78, 85, 87]
premium_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]
deluxe_scores = [95, 98, 92, 97, 96, 94, 98, 97, 92, 99]

groups = {
    "Standard": standard_scores,
    "Premium": premium_scores,
    "Deluxe": deluxe_scores,
}

# Long-format dataframe: one row per score, labelled with its product version
data = pd.DataFrame({
    "Product": [name for name, scores in groups.items() for _ in scores],
    "Score": [score for scores in groups.values() for score in scores],
})

# One-way ANOVA across the three versions
fvalue, pval = stats.f_oneway(*groups.values())

# Print the results
print("F-statistic:", fvalue, end="\n\n\n\n")
print("p-value:", pval)

# Interpretation at the 0.05 level
if pval < 0.05:
    conclusion = "\n\nReject null hypothesis. There is a significant difference in customer satisfaction scores among the product versions"
else:
    conclusion = "Fail to reject null hypothesis. There is no significant difference in customer satisfaction scores among the product versions."
print(conclusion)