# Import the necessary libraries

In [20]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's ../scripts directory (resolved against the current
# working directory) importable, so the local modules below can be found.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from load_data import Load_Data
from fill_gender_based_on_title import Fill_gender_based_on_title
from data_processor import DataProcessor
from data_processor import HypothesisTester
from data_processor import ReportGenerator
# NOTE(review): this import rebinds DataProcessor and ReportGenerator, so after
# this cell runs those two names refer to the data_processor_for_postal_code
# versions, not the data_processor versions imported just above. The province
# and gender cells pair them with data_processor.HypothesisTester — confirm the
# two implementations are interchangeable, or alias one set on import.
from data_processor_for_postal_code import DataProcessor, Tester, ReportGenerator
from margin_analysis import MarginAnalysis


# Load data

In [11]:
# Create a Load_Data reader pointed at the cleaned CSV
csv_reader = Load_Data('../data/cleaned_data.csv')

# Read the file (the recorded output shows a "Data successfully loaded" message)
csv_reader.load_data()

# Retrieve the loaded data; later cells use it as a pandas DataFrame
df = csv_reader.get_data()

# `df` is the frame every following cell operates on

  self.data = pd.read_csv(self.file_path)


Data successfully loaded from ../data/cleaned_data.csv


# Fill the Gender column (Male/Female) based on title, and list the provinces

In [12]:
# Fill in the Gender column from each record's title. The helper comes from
# ../scripts; the recorded output shows it printing the Male/Female counts.
df = Fill_gender_based_on_title(df)

# Series.unique() already returns every distinct value once, in order of first
# appearance, so a drop_duplicates() pass over the whole frame beforehand is
# unnecessary.
unique_provinces = df['Province'].unique()
print(unique_provinces)

Gender
Male      934365
Female     65733
Name: count, dtype: int64
['Gauteng' 'KwaZulu-Natal' 'Mpumalanga' 'Eastern Cape' 'Western Cape'
 'Limpopo' 'North West' 'Free State' 'Northern Cape']


# A/B testing for Province

In [None]:
# Processor that will split the policy data into per-province groups
data_processor = DataProcessor(df)

# KPI column compared across the groups (e.g., 'TotalClaims' or 'TotalPremium')
kpi = 'TotalClaims'

# Provinces to compare; extend this list to include more provinces
provinces = [
    'Gauteng',
    'KwaZulu-Natal',
    'Western Cape',
    'Mpumalanga',
    'Limpopo',
    'North West',
    'Free State',
    'Northern Cape',
    'Eastern Cape',
]

# One boolean row mask per province, keyed by the province name
group_conditions = {
    province: (df['Province'] == province)
    for province in provinces
}

# Split the data into one group per province
groups = data_processor.segment_data('Province', group_conditions)

# Run the tester's ANOVA on the KPI across all province groups
tester = HypothesisTester()
p_value = tester.anova_test(groups, kpi)

# Interpret the p-value and record the outcome under a descriptive label
result = tester.analyze_results(p_value)
tester.add_result('Risk difference across provinces', p_value, result)

# Build the report from the recorded results and show it
report_generator = ReportGenerator(tester.results)
report = report_generator.generate_report()
print(report)


# A/B testing for Gender

In [9]:
# Initialize DataProcessor
data_processor = DataProcessor(df)

# Select KPI (e.g., 'TotalClaims' or 'TotalPremium')
kpi = 'TotalClaims'  # You only need to pass the column name

# Segment data by gender (testing for a risk difference between men and women)
group_conditions = {
    'Male': (df['Gender'] == 'Male'),
    'Female': (df['Gender'] == 'Female'),
}

# Segment data into one group per gender
groups = data_processor.segment_data('Gender', group_conditions)

# Initialize the HypothesisTester
tester = HypothesisTester()

# Run the tester's ANOVA between the two gender groups on the KPI
p_value = tester.anova_test(groups, kpi)

# Analyze the results. The label names the hypothesis actually tested here
# (gender); it previously carried the province label copied from the cell above.
result = tester.analyze_results(p_value)
tester.add_result('Risk difference between genders', p_value, result)

# Generate the report
report_generator = ReportGenerator(tester.results)
report = report_generator.generate_report()
print(report)



A/B Hypothesis Testing Report
Hypothesis: Risk difference across provinces
P-Value: 0.032120655336210946
Result: Reject the null hypothesis



# A/B testing for postal code

In [14]:

# Prepare the data (bin the postal codes)
# NOTE(review): `prepare_data` is not imported in the notebook's imports cell
# as visible here, so this line fails on a fresh kernel unless it is defined
# elsewhere — confirm its source module and import it at the top.
df = prepare_data(df)

# Instantiate the data processor
data_processor = DataProcessor(df)

# One boolean row mask per postal-code bin, keyed by the bin label
bin_labels = df['PostalCodeBin'].unique()
group_conditions = {bin_label: (df['PostalCodeBin'] == bin_label) for bin_label in bin_labels}

# Segment the data based on postal code bins
groups = data_processor.segment_data('PostalCodeBin', group_conditions)

# Instantiate the Tester for t-tests
tester = Tester()

# Compare each bin with the next one in `groups` key order.
# NOTE(review): that order is not numeric — the recorded output compares
# '9000-10000' with '3000-4000' — so "neighbouring" bins are neighbours in the
# data, not in postal-code range. Confirm whether sorted or all-pairs
# comparisons were intended.
bin_labels = list(groups.keys())

for first_bin, second_bin in zip(bin_labels, bin_labels[1:]):
    # Perform the t-test on TotalClaims between the two bins
    p_value = tester.t_test(groups[first_bin], groups[second_bin], 'TotalClaims')

    # Analyze the result and record it under a descriptive label
    result = tester.analyze_results(p_value)
    description = f'T-Test between {first_bin} and {second_bin}'
    tester.add_result(description, p_value, result)

# Generate and display the report once, after every comparison is recorded.
# The report is built from the whole of tester.results, so building it inside
# the loop would print the earlier comparisons again on each iteration.
report_generator = ReportGenerator(tester.results)
report = report_generator.generate_report()
print(report)
    

Description: T-Test between 1000-2000 and 2000-3000
P-value: 0.020925106031974526
Analysis Result: Significant difference

Description: T-Test between 2000-3000 and 4000-5000
P-value: 0.0641924603756658
Analysis Result: No significant difference

Description: T-Test between 4000-5000 and 6000-7000
P-value: 0.03210287845961992
Analysis Result: Significant difference

Description: T-Test between 6000-7000 and 7000-8000
P-value: 0.6739449548314396
Analysis Result: No significant difference

Description: T-Test between 7000-8000 and 9000-10000
P-value: 0.6290348230367926
Analysis Result: No significant difference

Description: T-Test between 9000-10000 and 3000-4000
P-value: 0.6603889348717926
Analysis Result: No significant difference

Description: T-Test between 3000-4000 and 5000-6000
P-value: 0.3583236224498323
Analysis Result: No significant difference

Description: T-Test between 5000-6000 and 8000-9000
P-value: 0.04858307600730958
Analysis Result: Significant difference



# A/B testing for margin (profit)

In [22]:
# Imports are kept in this cell so it can run on its own.
import pandas as pd
from scipy.stats import ttest_ind, f_oneway

# Small illustrative sample. It gets its own name so that the policy data held
# in `df` (and the `groups` built by earlier cells) is not overwritten when
# this cell runs.
margin_sample = pd.DataFrame({
    'PostalCode': [8000, 8000, 9000, 9000, 10000, 10000],
    'TotalPremium': [1000, 1500, 1200, 1600, 1100, 1300],
    'TotalClaims': [500, 700, 600, 900, 500, 800]
})

# Margin (profit) per row: premium collected minus claims paid
margin_sample['Margin'] = margin_sample['TotalPremium'] - margin_sample['TotalClaims']

# Significance level shared by both tests
alpha = 0.05

# Welch's t-test (equal_var=False) for two zip codes (8000 vs 9000)
group_8000 = margin_sample.loc[margin_sample['PostalCode'] == 8000, 'Margin']
group_9000 = margin_sample.loc[margin_sample['PostalCode'] == 9000, 'Margin']
t_stat, p_value_ttest = ttest_ind(group_8000, group_9000, equal_var=False)

# Analyze the result
if p_value_ttest < alpha:
    result_ttest = "Reject the null hypothesis: There is a significant difference in margins between zip codes."
else:
    result_ttest = "Fail to reject the null hypothesis: There is no significant difference in margins between zip codes."

# Display the results
print(f"T-test p-value: {p_value_ttest}")
print(result_ttest)

# With more than two groups, use one-way ANOVA across every zip code.
# sort=False keeps the groups in order of first appearance.
margin_groups = [
    margins
    for _, margins in margin_sample.groupby('PostalCode', sort=False)['Margin']
]
f_stat, p_value_anova = f_oneway(*margin_groups)

# Analyze ANOVA result
if p_value_anova < alpha:
    result_anova = "Reject the null hypothesis: There is a significant difference in margins across zip codes."
else:
    result_anova = "Fail to reject the null hypothesis: There is no significant difference in margins across zip codes."

# Display ANOVA results
print(f"ANOVA p-value: {p_value_anova}")
print(result_anova)


T-test p-value: 1.0
Fail to reject the null hypothesis: There is no significant difference in margins between zip codes.
ANOVA p-value: 0.7220963083946866
Fail to reject the null hypothesis: There is no significant difference in margins across zip codes.
