In [1]:
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
wefarm = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/wefarm.parquet")

In [4]:
wefarm["response_sent_dt"] = pd.to_datetime(wefarm["response_sent"], format="ISO8601")
wefarm["question_set_dt"] = pd.to_datetime(wefarm["question_sent"], format="ISO8601")

In [5]:

def parse_datetime_col(df, colstr):
  df[f'{colstr}_year'] = df[colstr].dt.year
  df[f'{colstr}_month'] = df[colstr].dt.month
  df[f'{colstr}_day'] = df[colstr].dt.day
  df[f'{colstr}_hour'] = df[colstr].dt.hour
  df[f'{colstr}_minute'] = df[colstr].dt.minute
  df[f'{colstr}_second'] = df[colstr].dt.second
  df[f'{colstr}_day_name'] = df[colstr].dt.day_name()
  df[f'{colstr}_day_of_week'] = df[colstr].dt.day_of_week
  df[f'{colstr}_month_name'] = df[colstr].dt.month_name()
  df[f'{colstr}_quarter'] = df[colstr].dt.quarter
  df[f'{colstr}_week'] = df[colstr].dt.isocalendar().week
  df[f'{colstr}_date'] = df[colstr].dt.date
  df[f'{colstr}_time'] = df[colstr].dt.time

  return df

In [6]:
wefarm = parse_datetime_col(wefarm, "response_sent_dt")
wefarm = parse_datetime_col(wefarm, "question_set_dt")

In [7]:
niche = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_niche.parquet")
broad = pd.read_parquet("/content/drive/MyDrive/DataKit_WeFarm/en_questions_cat_broad.parquet")

In [8]:
all = pd.merge(wefarm, niche, on="question_id").merge(
  broad, on="question_id"
)

In [9]:
import json

with open("/content/drive/MyDrive/DataKit_WeFarm/segment_user_ids.json", "r") as r:
    segment_user_id_dct = json.load(r)

In [10]:
segment_user_id_dct.keys()

dict_keys(['user_activity_post_count', 'speed_post_response', 'unique_askers', 'tenure'])

In [11]:
# segment_user_id_dct["user_activity_post_count"].keys()

In [12]:
from collections import defaultdict

user_segments = defaultdict(dict)
for agg, dct in segment_user_id_dct.items():
  for segment, user_ids in dct.items():
    for user_id in user_ids:
      user_segments[user_id][agg] = segment

row_lst = []
for user_id, segments in user_segments.items():
  row = {"user_id": user_id}
  row.update(segments)
  row_lst.append(row)

user_segments_df = pd.DataFrame(row_lst)

In [13]:
all1 = all.merge(user_segments_df, left_on="question_user_id", right_on="user_id", how="inner")

In [14]:
all2 = all1[all1["question_user_country_code"].isin(["ug", "ke"])]

In [15]:
all2["question_user_country_code"] = [j.upper() for j in all2["question_user_country_code"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all2["question_user_country_code"] = [j.upper() for j in all2["question_user_country_code"]]


In [16]:
all3 = all2.drop_duplicates("question_id")

In [17]:
# def intersection_mask(ua_subsegment_lst, spr_subsegment_lst, cs_subsegment_lst, ten_subsegment_lst):

#   mk = pd.Series(True, index=all3.index)

#   if ua_subsegment_lst:
#     mk &= all3["user_activity_post_count"].isin(ua_subsegment_lst)
#   if spr_subsegment_lst:
#     mk &= all3["speed_post_response"].isin(spr_subsegment_lst)
#   if cs_subsegment_lst:
#     mk &= all3["conversation_starters"].isin(cs_subsegment_lst)
#   if ten_subsegment_lst:
#     mk &= all3["tenure"].isin(ten_subsegment_lst)

#   big_mk = all3.loc[mk].index

#   return len(big_mk)

In [18]:
# intersection_mask(["5", "4"], ["5", "1", "2", "3"], ["5"], ["5"])

In [19]:
segment_name_lst = [
  "user_activity_post_count",
  "speed_post_response",
  "unique_askers",
  "tenure"
]

subsegment_lst = [
  None,
  "1",
  "2",
  "3",
  "4",
  "5",
]

In [20]:
from itertools import product

def index_combos(segment_name_lst, subsegment_lst):
  combo_dcts_lst = []
  for combo in product(subsegment_lst, repeat=4):
    combo_dct = {}
    for idx, segment in enumerate(segment_name_lst):
      if combo[idx] is None:
        combo_dct[segment] = ["1", "2", "3", "4", "5"]
      else:
        combo_dct[segment] = [combo[idx]]
    combo_dcts_lst.append(combo_dct)

  return combo_dcts_lst

In [21]:
def chunk(total_combos, chunk_size):
  chunks = []
  for j in range(0, total_combos, chunk_size):
    chunks.append((j, min(j+chunk_size, total_combos)))
  return chunks

In [22]:
from tqdm import tqdm
import pickle
import json

def find_response_ids_from_combos_chunk(df, segment_name_lst, subsegment_lst, combos_dcts_lst, chunks, chunk_idx):

  # chunk start and end indicies
  start_idx, end_idx = chunks[chunk_idx]

  # find intersections
  intersection_dct = {}
  for combo_idx in range(start_idx, end_idx):
    combo = combos_dcts_lst[combo_idx]

    mk = pd.Series(True, index=df.index)
    for segment in segment_name_lst:
      mk &= df[segment].isin(combo[segment])

    # retrieve user_ids
    idc = df.loc[mk].index.tolist()

    # save as json
    key = json.dumps(combo, sort_keys=True)
    intersection_dct[key] = idc

  # filename=f"chunk_{chunk_idx}.pkl"
  # with open(filename, "wb") as wb:
  #   pickle.dump(intersection_dct, wb)
  # !cp {filename} /content/drive/MyDrive/DataKit_WeFarm/Dashboard/intersections

  return intersection_dct

In [23]:
combos_dcts_lst = index_combos(segment_name_lst, subsegment_lst)
chunk_lst = chunk(len(combos_dcts_lst), 10)

In [24]:
intersection_dct = {}
for chunk_idx in tqdm(range(len(chunk_lst))):
  intersection_dct.update(find_response_ids_from_combos_chunk(all3, segment_name_lst, subsegment_lst, combos_dcts_lst, chunk_lst, chunk_idx))

100%|██████████| 130/130 [03:40<00:00,  1.70s/it]


In [25]:
list(intersection_dct.keys())[0]

'{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["1", "2", "3", "4", "5"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}'

In [26]:
filename=f"segmentation_intersections_all.pkl"
with open(filename, "wb") as wb:
  pickle.dump(intersection_dct, wb)
!cp {filename} /content/drive/MyDrive/DataKit_WeFarm/Dashboard/intersections

# debugging

In [27]:
# import pickle

# filename="/content/drive/MyDrive/DataKit_WeFarm/Dashboard/intersections/segmentation_intersections_all.pkl"
# with open(filename, "rb") as rb:
#   intersection_dct = pickle.load(rb)

In [28]:
intersection_dct.keys()

dict_keys(['{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["1", "2", "3", "4", "5"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}', '{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["1"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}', '{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["2"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}', '{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["3"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}', '{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["4"], "unique_askers": ["1", "2", "3", "4", "5"], "user_activity_post_count": ["1", "2", "3", "4", "5"]}', '{"speed_post_response": ["1", "2", "3", "4", "5"], "tenure": ["5"], "unique_askers": ["1", "2", "3", "4", "5"], "user_