## ARK

In [7]:
__author__ = 'Olatunji Ogunbiyi'
__version__ = "3.0"
__license__ = "MIT"

In [8]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
pd.set_option('display.max_columns', None)

In [3]:
class DataManager:
    
    def __init__(self,responses_filename):
        self.response_df = pd.read_csv(responses_filename)
        self.response_df.set_index('Bio: Name',inplace=True)
        del self.response_df.index.name
        timestamps = self.response_df['Timestamp']
        self.response_df.drop(columns=['Timestamp'],inplace=True)
        
    def sort_responses(self, df):
        bio_questions = []
        pref_questions = []
        int_questions = []
        hab_questions = []
        pers_questions = []
        flat_questions = []
        extra_questions = []

        for column in df.columns:
            if "Bio" in column:
                bio_questions.append(column)
            elif "Preferences" in column:
                pref_questions.append(column)
            elif "Interests" in column:
                int_questions.append(column)
            elif "Habits" in column:
                hab_questions.append(column)
            elif "Personality" in column:
                pers_questions.append(column)
            elif "Flat" in column:
                flat_questions.append(column)
            elif "Extra" in column:
                extra_questions.append(column)
            else:
                print(column)

        bio_df = df[bio_questions]
        pref_df = df[pref_questions]
        int_df = df[int_questions]
        habit_df = df[hab_questions]
        pers_df = df[pers_questions]
        flat_df = df[flat_questions]
        extra_df = df[extra_questions]

        return bio_df, pref_df, int_df, habit_df, pers_df, flat_df, extra_df
    
    def bio_score(self, bio_df):
        
        bio_filters_df = bio_df[['Bio: Gender', 
                         'Bio: Age', 
                         'Bio: Sexuality', 
                         'Bio: Birthplace',
                         'Bio: Diet', 
                         'Bio: Ethnicity',
                         'Bio: Political views', 
                         'Bio: Highest level of education',
                         'Bio: Employment status']]

        bio_score_df = bio_df[['Bio: How often do you drink?', 
                       'Bio: How often do you smoke?',
                       'Bio: Do you smoke/take weed?',
                       'Bio: How often do you partake in any other recreational substances?']]
        
        bio_score_cos_matrix = pd.DataFrame(cosine_similarity(bio_score_df),
             index=bio_score_df.index,
             columns=bio_score_df.index)
        
        return bio_score_cos_matrix, bio_filters_df
    
    def pref_score(self, pref_df, bio_filters_df):
        
        pref_df["dummy"] = 0
        pref_df = pref_df.reset_index().rename(columns={"index":"pref_name"})
        bio_filters_df["dummy"] = 0
        bio_filters_df = bio_filters_df.reset_index().rename(columns={"index":"bio_name"})
        
        merged_df = pd.merge(pref_df,bio_filters_df,on="dummy",how="outer")
        merged_df["Preferences: Minimum age of flatmate"] = merged_df["Preferences: Minimum age of flatmate"].fillna(18)
        merged_df["Preferences: Maximum age of flatmate"] = merged_df["Preferences: Maximum age of flatmate"].fillna(100)
        
        merged_df = merged_df[merged_df["pref_name"] != merged_df["bio_name"]]
        merged_df["match"] = 0
        
        def pref_filter(row):
            if row["Bio: Age"] >= row["Preferences: Minimum age of flatmate"] and\
            row["Bio: Age"] <= row["Preferences: Maximum age of flatmate"]:
                if row["Preferences: Sex of flatmate"] == "Open to all":
                    return 1
                elif row["Bio: Gender"] in row["Preferences: Sex of flatmate"]:
                    return 1
                else:
                    return 0
            else:
                return 0
        
        merged_df["match"] = merged_df.apply(lambda row: pref_filter(row), axis=1)
        merged_df = merged_df[["pref_name","bio_name","match"]]
        
        merged_df_double = pd.merge(merged_df,
                            merged_df,
                            left_on=["pref_name","bio_name"],
                            right_on=["bio_name","pref_name"],
                            how="inner")
        
        merged_df_double = merged_df_double[merged_df_double["pref_name_x"] != merged_df_double["bio_name_x"]]
        
        merged_df_double = merged_df_double[(merged_df_double["match_x"]==1)&(merged_df_double["match_y"]==1)]
        merged_df_double = merged_df_double[["pref_name_x","pref_name_y"]]
        
        pref_matrix = merged_df_double[["pref_name_x","pref_name_y"]]
        
        return pref_matrix
    
    def int_score(self, int_df):
        
        interest_score_cos_matrix = pd.DataFrame(cosine_similarity(int_df),
                                      index=int_df.index,
                                      columns=int_df.index)
        
        return interest_score_cos_matrix
    
    def habit_score(self, habit_df):
        
        # One hot encodes string variables

        # groups columns by dtype and puts in dict
        habit_df_dtype_dict = habit_df.columns.to_series().groupby(habit_df.dtypes).groups

        # selects list of text columns
        habit_df_text_columns = list(habit_df_dtype_dict[np.dtype('O')])

        # creates df from one hot encoded text columns
        habit_object_df = pd.get_dummies(habit_df[habit_df_text_columns])

        # drops old text columns
        habit_df.drop(columns=habit_df_text_columns,inplace=True)

        # concats new one hot encoded columns
        habit_df = pd.concat([habit_df,habit_object_df],axis=1)
    
        habit_score_cos_matrix = pd.DataFrame(cosine_similarity(habit_df),
                                          index=habit_df.index,
                                          columns=habit_df.index)
        
        return habit_score_cos_matrix
    
    def pers_score(self, pers_df):
        
        personality_score_cos_matrix = pd.DataFrame(cosine_similarity(pers_df),
             index=pers_df.index,
             columns=pers_df.index)
        
        return personality_score_cos_matrix

    def flat_score(self, flat_df):

        flat_df["Flat: Where are you looking to live in London? (Tick all that apply)"] = \
        flat_df["Flat: Where are you looking to live in London? (Tick all that apply)"]\
        .str.split(',')

        flat_df["Flat: What price range are you looking for? Tick all that apply"] = \
        flat_df["Flat: What price range are you looking for? Tick all that apply"]\
        .str.split(',')
        
        flat_df["Flat: Where are you looking to live in London? (Tick all that apply)"] = \
        flat_df["Flat: Where are you looking to live in London? (Tick all that apply)"].\
        apply(lambda x: [zone.lstrip(' ') for zone in x])
        
        flat_df["Flat: What price range are you looking for? Tick all that apply"] = \
        flat_df["Flat: What price range are you looking for? Tick all that apply"].\
        apply(lambda x: [price_range.lstrip(' ') for price_range in x])
        
        def one_hot_encode_list(df,column_name):
            mlb = MultiLabelBinarizer()
            df = df.join(pd.DataFrame(mlb.fit_transform(df.pop(column_name)),
                                  columns=mlb.classes_,
                                  index=df.index))
            return df
        
        flat_df = one_hot_encode_list(flat_df,
                              "Flat: Where are you looking to live in London? (Tick all that apply)")

        flat_df = one_hot_encode_list(flat_df,
                              "Flat: What price range are you looking for? Tick all that apply")
        
        flat_df.columns = [column.strip() for column in flat_df.columns]
        
        return flat_df
        
    def run_data_manager(self):
        
        bio_df, pref_df, int_df, habit_df, pers_df, flat_df, extra_df = self.sort_responses(self.response_df)
        
        bio_score_cos_matrix, bio_filters_df = self.bio_score(bio_df)
        pref_matrix = self.pref_score(pref_df, bio_filters_df)
        interest_score_cos_matrix = self. int_score(int_df)
        habit_score_cos_matrix = self.habit_score(habit_df)
        personality_score_cos_matrix = self.pers_score(pers_df)
        flat_df = self.flat_score(flat_df)
        
        score_dict = {"user_info":bio_filters_df,
             "bio_score_cos_matrix":bio_score_cos_matrix,
             "pref_matrix":pref_matrix,
             "interest_score_cos_matrix":interest_score_cos_matrix,
             "habit_score_cos_matrix":habit_score_cos_matrix,
             "personality_score_cos_matrix":personality_score_cos_matrix,
             "flat_info":flat_df}
        
        return score_dict
        

In [8]:
run_through = DataManager(responses_filename='ark_responses_2.csv')
score_dict = run_through.run_data_manager()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a 

In [18]:
class UserApp:
    
    def __init__(self,name,score_dict):
        self.name = name
        self.user_info = score_dict["user_info"].loc[score_dict["user_info"].index == name]
        self.bio_score_cos_matrix = score_dict["bio_score_cos_matrix"][[self.name]]
        self.pref_matrix = score_dict["pref_matrix"][score_dict["pref_matrix"]["pref_name_x"] == self.name]
        self.interest_score_cos_matrix = score_dict["interest_score_cos_matrix"][[self.name]]
        self.habit_score_cos_matrix = score_dict["habit_score_cos_matrix"][[self.name]]
        self.personality_score_cos_matrix = score_dict["personality_score_cos_matrix"][[self.name]]
        self.flat_info = score_dict["flat_info"]
    
    def flat_df_match(self,flat_df,names_tuple):
    
        zone_filter_columns = ["Zone 1",
                         "Zone 2",
                         "Zone 3",
                         "Zone 4",
                         "Zone 5",
                         "Zone 6",
                         ]

        price_filter_columns = [
                         "0-500",
                         "500-599",
                         "600-699",
                         "700-799",
                         "800-899",
                         "900-999",
                         "1000+"]

        temp_df = flat_df.loc[names_tuple]

        zone_list = []
        for zone in zone_filter_columns:
            if zone not in temp_df.columns:
                pass
            else:
                if temp_df.loc\
                [[names_tuple[0]],zone].item() == temp_df.loc[[names_tuple[1]],zone].item():
                    zone_list.append(zone)
                else:
                    pass

        price_list = []
        for price in price_filter_columns:
            if price not in temp_df.columns:
                pass
            else:
                if temp_df.loc\
                [[names_tuple[0]],price].item() == temp_df.loc[[names_tuple[1]],price].item():
                    price_list.append(price)
                else:
                    pass

        result_dict = {"zones":zone_list,
                      "prices":price_list}

        return result_dict

    def combine_scores(self):
        bio_score_cos_matrix = self.bio_score_cos_matrix
        pref_matrix = self.pref_matrix
        interest_score_cos_matrix = self.interest_score_cos_matrix
        habit_score_cos_matrix = self.habit_score_cos_matrix
        personality_score_cos_matrix = self.personality_score_cos_matrix
        
        bio_score_cos_matrix["score_type"] = "bio"
        interest_score_cos_matrix["score_type"] = "interest"
        habit_score_cos_matrix["score_type"] = "habit"
        personality_score_cos_matrix["score_type"] = "personality"
        
        combo_score = pd.concat([bio_score_cos_matrix,
                                interest_score_cos_matrix,
                                habit_score_cos_matrix,
                                personality_score_cos_matrix],
                               axis=0)
        
        combo_score = combo_score.reset_index().rename(columns={"index":"match",
                                                                self.name:"score"})
        pref_matrix_list = list(pref_matrix["pref_name_y"])
        combo_score = combo_score[combo_score["match"].isin(pref_matrix_list)]
        combo_score = combo_score[combo_score["match"] != self.name].reset_index(drop=True)
        combo_score_average = combo_score.groupby("match").score.mean().reset_index()
        combo_score_average["score_type"] = "average"
        combo_score = pd.concat([combo_score_average,combo_score])
        
        combo_score_average["viable"] = 1
        
        return combo_score_average
    
    def flat_viability(self):
        combo_df = self.combine_scores()
        combo_df["zones"] = np.nan
        combo_df["prices"] = np.nan
        
        flat_info = self.flat_info        
        matches = list(flat_info.index)
        matches.remove(self.name)
        for match in matches:
            flat_match_dict = self.flat_df_match(flat_info,[self.name,match])
            zones = flat_match_dict["zones"]
            prices = flat_match_dict["prices"]
            combo_df.loc[combo_df["match"]==match,"zones"] = {"zones":zones}
            combo_df.loc[combo_df["match"]==match,"prices"] = {"prices":prices}
            
            if len(zones) == 0 or len(prices) == 0:
                combo_df.loc[combo_df["match"]==match,"viable"] = 0
        
        combo_df = combo_df[combo_df["viable"] == 1]
        combo_df.sort_values(by=["score"],ascending=False,inplace=True) 
        return combo_df.reset_index(drop=True)

In [19]:
app1 = UserApp(name = "Jane Doe", score_dict=score_dict)

In [20]:
test_df = app1.flat_viability()
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,match,score,score_type,viable,zones,prices
0,Marie Kondo,0.793366,average,1,"{'zones': ['Zone 1', 'Zone 6']}","{'prices': ['0-500', '500-599', '600-699']}"


In [1]:
from datetime import datetime

In [6]:
datetime.today().strftime("%m/%d/%Y")

'07/25/2020'