In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
# Creating a synthetic dataset of men and women: each user answers five
# profile questions, and every man has a match status for every woman.

# Fixed seed so that "Restart Kernel -> Run All" regenerates identical data
SEED = 42
random.seed(SEED)

# Number of users
num = 1000

# Dating profile questions for each
qs = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Answers to profile questions
ans = ['A', 'B', 'C', 'D', 'E']

# IDs do not depend on the question, so they are built once and used
# directly as the index (named 'id') instead of being re-assigned per question
men = pd.DataFrame(index=pd.Index([f"m{i}" for i in range(num)], name='id'))

women = pd.DataFrame(index=pd.Index([f"w{i}" for i in range(num)], name='id'))

for q in qs:

    # Making them categorical for preprocessing later
    men[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)

    women[q] = pd.Categorical(random.choices(ans, k=num), categories=ans)

# Creating match status between users: rows are men, columns are women.
# Each cell is 0, 1, or "unseen" (the man has not rated that woman).
ratings = pd.DataFrame(index=men.index, columns=women.index)

for i in ratings.columns:
    ratings[i] = random.choices([0, 1, "unseen"], k=num)

In [3]:
# Display the men's profile answers (one row per man id, one column per question)
men

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,E,A,E,E,C
m1,E,C,A,B,C
m2,C,B,C,D,A
m3,D,A,A,D,B
m4,D,C,E,C,D
...,...,...,...,...,...
m995,E,C,D,A,A
m996,E,D,E,B,B
m997,E,E,E,D,C
m998,A,D,C,A,A


In [4]:
# Per-man tally of each rating value (rows: men; columns: 0, 1, "unseen")
rating_counts = ratings.T.apply(pd.Series.value_counts).T

# Rank men by how many women they have not seen, largest count first
by_unseen = rating_counts.sort_values(by="unseen", ascending=False)

# The selected man is the one with the most "unseen" entries;
# his id is available as m_user.name
m_user = by_unseen.iloc[0]

In [5]:
# Women the selected man has not rated: the column labels of his ratings row
# whose value is "unseen"
unseen_mask = ratings.loc[m_user.name] == "unseen"
m_nrate = unseen_mask[unseen_mask].index

In [6]:
# Numeric view of the men's answers: every categorical column is replaced by
# its integer category codes
n_men = pd.DataFrame(
    {question: men[question].cat.codes for question in men.columns}
)


In [7]:
# Display the integer-coded version of the men's answers
n_men

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
m0,4,0,4,4,2
m1,4,2,0,1,2
m2,2,1,2,3,0
m3,3,0,0,3,1
m4,3,2,4,2,3
...,...,...,...,...,...
m995,4,2,3,0,0
m996,4,3,4,1,1
m997,4,4,4,3,2
m998,0,3,2,0,0


In [8]:
# Top 10 men most similar to the selected man, measured by the correlation
# between their integer-coded answers and his.
#
# The selected man is removed by label rather than by skipping the first
# sorted row: another man whose correlation is exactly 1.0 can sort ahead of
# him, in which case a positional [1:11] slice would keep the selected man in
# his own neighbour list and drop a genuine perfect match.
target_answers = n_men.loc[m_user.name]

m_sim = (
    n_men.T.corrwith(target_answers)
    .drop(m_user.name)
    .sort_values(ascending=False)  # NaN correlations are placed last
    .head(10)
)

In [9]:
# Display the most similar men and their correlation with the selected man
m_sim

id
m760    0.987457
m8      0.970725
m980    0.943456
m367    0.943456
m77     0.943456
m193    0.941742
m128    0.941469
m268    0.931552
m146    0.930261
m54     0.930261
dtype: float64

In [10]:
# Ratings given by the similar men (rows) to the women the selected man has
# not seen (columns). Rows and columns are selected in a single .loc call
# instead of chained indexing, and .copy() makes the result an independent
# frame so that later modifications cannot touch `ratings` or trigger
# SettingWithCopyWarning.
msim_rate = ratings.loc[m_sim.index, m_nrate].copy()


In [11]:
# Display the similar men's ratings for the women the selected man has not seen
msim_rate

id,w0,w23,w25,w27,w28,w31,w36,w37,w40,w42,...,w971,w972,w979,w980,w981,w984,w986,w987,w996,w998
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m760,0,0,1,unseen,unseen,1,unseen,1,unseen,unseen,...,0,unseen,unseen,1,1,1,1,unseen,0,1
m8,1,0,1,1,1,1,0,0,1,0,...,unseen,1,0,0,0,0,0,1,unseen,1
m980,1,0,0,1,unseen,1,0,unseen,unseen,0,...,0,0,0,unseen,1,0,1,0,unseen,0
m367,1,1,1,unseen,1,1,unseen,unseen,0,1,...,1,1,0,unseen,unseen,1,1,unseen,1,0
m77,0,1,0,unseen,1,unseen,1,1,1,0,...,0,unseen,1,unseen,1,unseen,0,0,unseen,unseen
m193,unseen,unseen,0,1,0,1,0,0,0,0,...,1,0,0,1,unseen,0,0,unseen,0,1
m128,0,0,0,1,1,0,1,unseen,0,0,...,1,unseen,unseen,1,0,unseen,1,1,0,0
m268,unseen,0,unseen,1,0,0,0,unseen,1,1,...,1,unseen,1,1,0,unseen,0,1,1,unseen
m146,1,unseen,1,0,1,0,unseen,unseen,1,0,...,0,unseen,unseen,1,unseen,1,1,1,0,1
m54,0,1,unseen,0,unseen,0,0,0,0,0,...,1,0,unseen,unseen,0,1,0,unseen,0,1


In [12]:
# Man predictions: aggregate the similar men's ratings for each unseen woman.

# Turn "unseen" into NaN and convert to float so the reductions below run on
# a numeric frame and skip the missing ratings. The name is rebound to a new
# frame instead of being mutated in place, so re-running this cell is safe.
msim_rate = msim_rate.mask(msim_rate.eq("unseen")).astype(float)

m_predict = pd.DataFrame({
    # Average rating (share of similar men who rated 1)
    'avg': msim_rate.mean(),
    # Most frequent rating (mode); on a tie the smallest value is taken
    'freq': msim_rate.mode().iloc[0],
    # Median rating
    'median': msim_rate.median(),
})

In [13]:
# Display the per-woman prediction table (columns: avg, freq, median)
m_predict

Unnamed: 0_level_0,avg,freq,median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
w0,0.500000,0.0,0.5
w23,0.375000,0.0,0.0
w25,0.500000,0.0,0.5
w27,0.714286,1.0,1.0
w28,0.714286,1.0,1.0
...,...,...,...
w984,0.571429,1.0,1.0
w986,0.500000,0.0,0.5
w987,0.666667,1.0,1.0
w996,0.285714,0.0,0.0


In [15]:
# Display only the 'avg' column of the prediction table
m_predict['avg']

id
w0      0.500000
w23     0.375000
w25     0.500000
w27     0.714286
w28     0.714286
          ...   
w984    0.571429
w986    0.500000
w987    0.666667
w996    0.285714
w998    0.625000
Name: avg, Length: 380, dtype: float64

In [17]:
# Sort the predicted averages (ascending). Python has no builtin `sort`
# function; sorting is a method on the Series itself.
m_predict['avg'].sort_values()

In [18]:
# One-column DataFrame containing just the predicted average ratings
df = m_predict['avg'].to_frame()
 

In [19]:
# Display the one-column frame of predicted average ratings
df

Unnamed: 0_level_0,avg
id,Unnamed: 1_level_1
w0,0.500000
w23,0.375000
w25,0.500000
w27,0.714286
w28,0.714286
...,...
w984,0.571429
w986,0.500000
w987,0.666667
w996,0.285714


In [20]:
# `sort_values` is a DataFrame method, not a free function, and a DataFrame
# needs the column to sort by.
df.sort_values(by='avg')

In [21]:
# The method has to be called (with the sort column) to get sorted data;
# referencing it without parentheses only displays the bound method object.
df.sort_values(by='avg', ascending=False)

In [24]:

# Ten women with the highest predicted average rating, best first
ranked = df.sort_values(by='avg', ascending=False)
ranked.iloc[:10]

Unnamed: 0_level_0,avg
id,Unnamed: 1_level_1
w925,1.0
w733,1.0
w347,1.0
w395,1.0
w606,1.0
w123,0.888889
w453,0.875
w629,0.875
w513,0.875
w149,0.857143
