#### Loading CSV File

In [9]:
import pandas as pd

# Load the questionnaire responses; the file is semicolon-separated.
divorce_df = pd.read_csv('divorce_data.csv', sep=';')

# Preview the first rows to check that the columns were parsed as expected.
print(divorce_df.head())

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare -- wrapping it in print() adds a stray "None" line.
divorce_df.info()

# Summary statistics for the numeric columns.
print(divorce_df.describe())

   Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  ...  Q46  Q47  Q48  Q49  Q50  Q51  \
0   2   2   4   1   0   0   0   0   0    0  ...    2    1    3    3    3    2   
1   4   4   4   4   4   0   0   4   4    4  ...    2    2    3    4    4    4   
2   2   2   2   2   1   3   2   1   1    2  ...    3    2    3    1    1    1   
3   3   2   3   2   3   3   3   3   3    3  ...    2    2    3    3    3    3   
4   2   2   1   1   1   1   0   0   0    0  ...    2    1    2    3    2    2   

   Q52  Q53  Q54  Divorce  
0    3    2    1        1  
1    4    2    2        1  
2    2    2    2        1  
3    2    2    2        1  
4    2    1    0        1  

[5 rows x 55 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 55 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Q1       170 non-null    int64
 1   Q2       170 non-null    int64
 2   Q3       170 non-null    int64
 3   Q4       170 non-null    int64
 

#### Loading the Reference File (`reference.tsv`, pipe-delimited)

In [10]:
# Load the question reference table. Despite the .tsv extension the fields
# are separated by '|', so that is the separator passed to the parser.
reference_df = pd.read_csv('reference.tsv', sep='|')

# Preview the first rows of the reference table.
print(reference_df.head())

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare -- wrapping it in print() adds a stray "None" line.
reference_df.info()

# Summary statistics for the numeric columns.
print(reference_df.describe())

   atribute_id                                        description
0            1  If one of us apologizes when our discussion de...
1            2  I know we can ignore our differences, even if ...
2            3  When we need it, we can take our discussions w...
3            4  When I discuss with my spouse, to contact him ...
4            5   The time I spent with my wife is special for us.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   atribute_id  54 non-null     int64 
 1   description  54 non-null     object
dtypes: int64(1), object(1)
memory usage: 996.0+ bytes
None
       atribute_id
count    54.000000
mean     27.500000
std      15.732133
min       1.000000
25%      14.250000
50%      27.500000
75%      40.750000
max      54.000000


#### Data Merging and Cleanup

In [12]:
# Build the question-number -> question-text lookup from the reference table,
# so this cell runs on a fresh kernel instead of relying on a `questions`
# dict left behind by a cell that is no longer in the notebook.
# ('atribute_id' is spelled exactly as it appears in the reference file.)
questions = dict(zip(reference_df['atribute_id'], reference_df['description']))

# Reset the headers to Q1..Q54 followed by 'Divorce'. Doing this first makes
# the cell safe to re-run after the columns have already been renamed.
divorce_df.columns = [f'Q{i}' for i in range(1, 55)] + ['Divorce']

# Replace each 'Q<n>' header with its question text. An explicit mapping
# leaves 'Divorce' (and any Q<n> without a reference entry) unchanged, and
# avoids parsing column names with int(x[1:]).
divorce_df = divorce_df.rename(
    columns={f'Q{number}': text for number, text in questions.items()})

# Verify the renaming and the structure of the DataFrame
print(divorce_df.head())

   If one of us apologizes when our discussion deteriorates, the discussion ends.  \
0                                                  2                                
1                                                  4                                
2                                                  2                                
3                                                  3                                
4                                                  2                                

   I know we can ignore our differences, even if things get hard sometimes.  \
0                                                  2                          
1                                                  4                          
2                                                  2                          
3                                                  2                          
4                                                  2                          

   When we nee

#### Exploratory Data Analysis (EDA)

In [1]:
# NOTE: seaborn must be installed in the kernel's environment; the recorded
# run of this cell stopped at this import with ModuleNotFoundError.
import seaborn as sns
import matplotlib.pyplot as plt

# Response counts for one question, drawn on an explicitly created Axes.
question_col = 'The time I spent with my wife is special for us.'
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=divorce_df, x=question_col, ax=ax)
ax.set_title('Distribution of Responses for Time Spent Together')
plt.show()

# Pairwise correlations between all columns; the 'Divorce' column of that
# matrix, sorted ascending, shows how each question relates to the outcome.
correlation_matrix = divorce_df.corr()
divorce_correlations = correlation_matrix['Divorce'].sort_values()
print("Correlations with Divorce:\n", divorce_correlations)

# Heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=False,
    linewidths=.5,
    ax=ax,
)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

ModuleNotFoundError: No module named 'seaborn'