# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Convert the pipe-delimited .txt file to .csv
# Input and output locations for the conversion.
txt_file_path = 'your_data.txt'
csv_file_path = 'converted_data.csv'

# Parse the raw text file, whose fields are separated by '|'.
df = pd.read_csv(txt_file_path, sep='|')

# Write it back out as a regular CSV, leaving the positional index out.
df.to_csv(csv_file_path, index=False)
print(f"Data successfully converted from .txt to {csv_file_path}")

# From here on `df` is the frame read back from the converted CSV.
df = pd.read_csv(csv_file_path)

### Step 2: Data Summarization

# Features summarised here; the plotting and correlation steps further down
# iterate over this same list.
numerical_columns = [
    'TotalPremium',
    'TotalClaims',
    'SumInsured',
    'CalculatedPremiumPerTerm',
]

# Summary statistics (DataFrame.describe) restricted to those features.
print("Descriptive Statistics for Numerical Features:")
print(df.loc[:, numerical_columns].describe())

# The dtype pandas assigned to every column of the frame.
print("\nData Types of Columns:")
print(df.dtypes)

### Step 3: Data Quality Assessment

# Number of null entries in each column.
missing_values = df.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")

### Step 4: Univariate Analysis

# One histogram per numerical feature: 30 bins with a KDE curve overlaid.
for feature in numerical_columns:
    _, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

# One bar chart per categorical feature showing how often each level occurs.
categorical_columns = ['Gender', 'MaritalStatus']
for category in categorical_columns:
    _, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=category, ax=ax)
    ax.set_title(f'Count of {category}')
    ax.set_xlabel(category)
    ax.set_ylabel('Count')
    plt.show()

### Step 5: Bivariate or Multivariate Analysis

# TotalPremium against TotalClaims, with points coloured by PostalCode.
_, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='TotalPremium',
    y='TotalClaims',
    hue='PostalCode',
    ax=ax,
)
ax.set_title('TotalPremium vs TotalClaims by PostalCode')
ax.set_xlabel('TotalPremium')
ax.set_ylabel('TotalClaims')
plt.show()

# Pairwise correlations between the numerical features, drawn as an
# annotated heatmap. `correlation_matrix` is read again by a later heatmap.
correlation_matrix = df[numerical_columns].corr()
_, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

### Step 6: Data Comparison (Trends Over Geography)


def _most_frequent(values: pd.Series):
    """Return the most common non-null entry of *values*, or NaN if there is none.

    ``value_counts()`` leaves nulls out, so a group whose entries are all
    missing produces an empty result; indexing that with ``[0]`` would raise
    ``IndexError`` and abort the whole aggregation.
    """
    counts = values.value_counts()
    if counts.empty:
        return np.nan
    # value_counts() sorts by descending frequency, so the first label wins.
    return counts.index[0]


# For every Province, report the most frequent CoverType,
# CalculatedPremiumPerTerm and Make.
grouped_data = (
    df.groupby('Province')[['CoverType', 'CalculatedPremiumPerTerm', 'Make']]
    .agg(_most_frequent)
)
print("\nTrends Over Geography (Province):")
print(grouped_data)

### Step 7: Outlier Detection

# One horizontal box plot per numerical feature, used to eyeball outliers.
for feature in numerical_columns:
    _, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Box Plot of {feature}')
    ax.set_xlabel(feature)
    plt.show()

### Step 8: Visualization (Creative Plots)

# 1. The correlation matrix computed earlier, redrawn with the YlGnBu palette.
_, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Heatmap of Correlations')
plt.show()

# 2. TotalPremium spread within each MaritalStatus level, as box plots.
_, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='MaritalStatus', y='TotalPremium', ax=ax)
ax.set_title('Distribution of TotalPremium by MaritalStatus')
ax.set_xlabel('MaritalStatus')
ax.set_ylabel('TotalPremium')
plt.show()

# 3. Grid of pairwise plots across all numerical features.
sns.pairplot(data=df[numerical_columns])
plt.show()
