In [97]:
# import
from itertools import product

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [98]:
# Read the couples dataset. The column pandas labels "Unnamed: 0" is used as
# the row index instead of being kept as a data column.
df = pd.read_csv(
    "../data/zodiac_marriage_success.csv",
    index_col="Unnamed: 0",
)

# Preview the first rows.
df.head()

Unnamed: 0,zodiac1,zodiac2,divorce
0,Leo,Capricorn,0
1,Aquarius,Sagittarius,0
2,Sagittarius,Virgo,0
3,Scorpio,Sagittarius,1
4,Pisces,Virgo,0


In [99]:
# One-hot encode zodiac1 and zodiac2 and add the two encodings into a single
# feature frame X: one column per sign, holding how many partners of the couple
# have that sign (0, 1, or 2 when both partners share the sign).
#
# dtype=int matters: recent pandas returns boolean dummies by default, and
# adding two boolean arrays is a logical OR, which would collapse a same-sign
# couple from 2 to 1.
X1 = pd.get_dummies(df['zodiac1'], dtype=int)
X2 = pd.get_dummies(df['zodiac2'], dtype=int)

# Add by column label rather than by array position, so a sign that is absent
# from one of the two columns cannot shift the other columns or break the sum.
# The dummy columns already carry the sign names, so no separate renaming step
# is needed; sort_index keeps them in alphabetical order.
X = X1.add(X2, fill_value=0).astype(int).sort_index(axis=1)

# define y variable
y = df["divorce"]

# Rearranging index to start from 1
# (from here on X and y are paired by row position, not by index label)
X.index = np.arange(1, len(X) + 1)

# Every row must describe exactly two partners.
assert len(X) == len(y)

# avoid the multi-line formatting
pd.set_option('expand_frame_repr', False)

print(X)


       Aquarius  Aries  Cancer  Capricorn  Gemini  Leo  Libra  Pisces  Sagittarius  Scorpio  Taurus  Virgo
1             0      0       0          1       0    1      0       0            0        0       0      0
2             1      0       0          0       0    0      0       0            1        0       0      0
3             0      0       0          0       0    0      0       0            1        0       0      1
4             0      0       0          0       0    0      0       0            1        1       0      0
5             0      0       0          0       0    0      0       1            0        0       0      1
...         ...    ...     ...        ...     ...  ...    ...     ...          ...      ...     ...    ...
24745         0      0       0          0       0    0      0       0            1        0       1      0
24746         0      0       0          0       0    1      0       0            0        1       0      0
24747         0      0       0       

In [100]:
# Hold out a test set. random_state makes the split (and therefore the score
# below) repeatable across runs; stratify=y keeps the share of divorces the
# same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Fit a logistic regression on the per-sign counts.
classifier = LogisticRegression(solver='lbfgs', max_iter=200)
classifier.fit(X_train, y_train)
predictions_test = classifier.predict(X_test)

# NOTE(review): accuracy alone can look high when one label dominates; compare
# it with the majority-class rate, e.g. y_test.value_counts(normalize=True).max().
accuracy_score(y_test, predictions_test)

0.947317388493859

In [101]:
# Create df with all possible zodiac combos: every ordered (zodiac1, zodiac2)
# pairing of the 12 signs, same-sign couples included (12 x 12 = 144 rows).
# `product` comes from the notebook's import cell.
zodiacs = [
    'Aries', 'Taurus', 'Gemini', 'Cancer', 'Leo', 'Virgo',
    'Libra', 'Scorpio', 'Sagittarius', 'Capricorn', 'Aquarius', 'Pisces',
]

# Single-column frame of the signs, kept available under its original name.
zodiacs_df = pd.DataFrame(zodiacs, columns=["zodiac"])

# Pair the list with itself directly; the first sign varies slowest.
all_zodiacs = pd.DataFrame(
    list(product(zodiacs, repeat=2)),
    columns=["zodiac1", "zodiac2"],
)
assert len(all_zodiacs) == len(zodiacs) ** 2

all_zodiacs

Unnamed: 0,zodiac1,zodiac2
0,Aries,Aries
1,Aries,Taurus
2,Aries,Gemini
3,Aries,Cancer
4,Aries,Leo
...,...,...
139,Pisces,Scorpio
140,Pisces,Sagittarius
141,Pisces,Capricorn
142,Pisces,Aquarius


In [102]:
# Encode every candidate pairing the same way as the training features:
# one column per sign holding how many partners have it (2 for same-sign pairs).
#
# dtype=int matters: recent pandas returns boolean dummies by default, and
# adding two boolean arrays is a logical OR, which would encode a same-sign
# pair such as (Aries, Aries) as 1 instead of 2.
X1_pred = pd.get_dummies(all_zodiacs['zodiac1'], dtype=int)
X2_pred = pd.get_dummies(all_zodiacs['zodiac2'], dtype=int)

# Add by column label so the sum never depends on column position.
X_pred = X1_pred.add(X2_pred, fill_value=0).astype(int)

# Use exactly the columns, in exactly the order, of the training frame X so the
# fitted model sees the features it was trained on.
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

# Rearranging index to start from 1
X_pred.index = np.arange(1, len(X_pred) + 1)

# One feature row per pairing, in the same row order as all_zodiacs.
assert len(X_pred) == len(all_zodiacs)

# avoid the multi-line formatting
pd.set_option('expand_frame_repr', False)

In [103]:
# Predict a class label for every row of X_pred (one row per sign pairing)
# using the logistic-regression model fitted earlier in the notebook.
predictions = classifier.predict(X_pred)

In [106]:
# Attach each predicted label to its pairing. X_pred was built row-for-row from
# all_zodiacs, so the predictions line up with the rows by position.
# NOTE(review): presumably 1 means "divorce", mirroring the training target
# df["divorce"] — confirm before interpreting the column.
all_zodiacs["bad_match"] = predictions

In [108]:
# Count how many pairings received each predicted label.
# NOTE(review): if only one label shows up, the model assigns the same class to
# every pairing and the column does not distinguish between matches.
all_zodiacs["bad_match"].value_counts()

0    144
Name: bad_match, dtype: int64