In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem import SnowballStemmer
import random
plt.style.use("seaborn")
import json

## Data Preparation
1. Get Data from victims.csv and volunteers.csv
2. clean data

In [2]:
# Get Victims Data
snow = SnowballStemmer(language='english')
victims_df = pd.read_csv("victims.csv")
victims_df["verbs"] = [[snow.stem(y) for y in x.split('-')] for x in victims_df["verbs"]]
victims_df["priorities"] = [[ int(y) for y in x.split('-') ] for x in victims_df["priorities"]]
print(victims_df.head())
#print("\n\n")

# Get Volunteers Data
volunteers_df = pd.read_csv("volunteers.csv")
volunteers_df["verbs"] = [[snow.stem(y) for y in x.split('-')] for x in volunteers_df["verbs"]]
volunteers_df_copy = pd.read_csv("volunteers.csv")
volunteers_df_copy["verbs"] = [x.split('-') for x in volunteers_df_copy["verbs"]]
volunteers_df_copy.drop(columns=["can_serve"])
print(volunteers_df.head())

   id    help_needed                                 verbs       priorities
0  S1  Psychological  [medicin, drive, sing, danc, comedi]  [9, 2, 4, 4, 5]
1  S2           Food              [drive, farm, distribut]        [5, 9, 8]
2  S3  Transportaion                               [drive]              [6]
3  S4       Teaching            [teach, art, danc, scienc]     [9, 8, 7, 6]
   id Volunteer               verbs  can_serve
0  V1    Singer              [sing]          2
1  V2    Artist         [act, sing]          2
2  V3    Driver  [drive, distribut]          2
3  V4    Doctor           [medicin]          2
4  V5    Farmer              [farm]          2


In [3]:
#print(victims_df.iloc[1].values)
results = dict()
for i in range(victims_df.shape[0]):
    row = victims_df.iloc[i].values
    result = {}
    idx = row[0]
    result['help'] = row[1]
    result['verbs'] = row[2]
    result['volunteers_allocated'] = []
    results[idx] = result
    
#results
#results['S1']['volunteers_allocated'].append('V1')


In [4]:
k= victims_df.shape[0]
colors = ['green','red','blue','yellow','orange']
clusters = {}
for i in range(k):
    victim = victims_df.iloc[i].to_dict() # they will be victims
    points = []
    cluster = {
        'victim': victim,
        'points': points,
        'color': colors[i%len(colors)]
    }
    clusters[i] = cluster

In [5]:
def get_common_elements(v1,v2):
    return list(set(v1) & set(v2))

def remove_common_elements(a,b):
    return list(set(a) - set(b))

def get_distance(common, arr= None, priorities = None):
    if priorities is None or arr is None:
        return len(common)
    else:
        distance = np.sum([priorities[ix] for ix, val in enumerate(arr) if val in common])
        return distance

In [6]:
#distance(volunteers_df["verbs"].iloc[2],victims_df["verbs"].iloc[1])
'''
victim = volunteers_df.iloc[0].to_dict()
print(victim)
print(clusters)

cur_x = volunteers_df.iloc[1].to_dict()
print(cur_x)
print(clusters[2]['victim']['verbs'])
distance(cur_x['verbs'],clusters[2]['victim']['verbs'])

common = [1,2,3,4,5]
arr = [1,2,3,4,5,6,7,8,9,10]
priorities = [5,4,7,9,2,1,5,4,2,7]
print(get_distance(common))
'''

"\nvictim = volunteers_df.iloc[0].to_dict()\nprint(victim)\nprint(clusters)\n\ncur_x = volunteers_df.iloc[1].to_dict()\nprint(cur_x)\nprint(clusters[2]['victim']['verbs'])\ndistance(cur_x['verbs'],clusters[2]['victim']['verbs'])\n\ncommon = [1,2,3,4,5]\narr = [1,2,3,4,5,6,7,8,9,10]\npriorities = [5,4,7,9,2,1,5,4,2,7]\nprint(get_distance(common))\n"

In [7]:
# E-step
# Parameters : X = dataset -> pd.DataFrame
#            : clusters = victimslist -> dictionary 
def AssignPointsToClusters(Volunteers, clusters):
    done = True
    for ix in range(Volunteers.shape[0]):
        dist = []
        common_elements_list = []
        cur_vol = Volunteers.iloc[ix].to_dict()
        for kx in range(k):
            common_elements = get_common_elements(cur_vol['verbs'],clusters[kx]['victim']['verbs'])
            common_elements_list.append(common_elements)
            # Get Distance score based on Priorities
            distance = get_distance(common_elements, clusters[kx]['victim']['verbs'], clusters[kx]['victim']['priorities'])
            dist.append(distance)
        
        maximum = np.max(dist)
        max_list = [idx for idx,val in enumerate(dist) if val == maximum]
        # Randomly choose one victim to be assigned if multiple victims have same distance score
        cur_cluster = random.choice(max_list)
        print("dist:", dist, "max:", maximum, "argmax(selected):", cur_cluster)
        if maximum != 0:
            clusters[cur_cluster]['points'].append(cur_vol)
            # Remove common Elements from Victims so that further Volunteers are not assigned
            common_elements = common_elements_list[cur_cluster]
            clusters[cur_cluster]['victim']['verbs'] = remove_common_elements(clusters[cur_cluster]['victim']['verbs'],common_elements)    
            if cur_vol['can_serve'] == 1:
                cur_vol['verbs'] = remove_common_elements(cur_vol['verbs'],common_elements)
                volunteers_df['verbs'].iloc[ix] = cur_vol['verbs']
            else:
                volunteers_df['can_serve'].iloc[ix] = cur_vol['can_serve'] - 1
            done = False
    return done
        

# M-step
def UpdateClusters(clusters,k):
    for kx in range(k):
        pts = np.array(clusters[kx]['points'])
        if pts.shape[0]>0:
            for ix in range(pts.shape[0]):
                idx = clusters[kx]['victim']['id']
                verbs_list = volunteers_df_copy['verbs'].loc[volunteers_df_copy["id"] == pts[ix]['id']].to_list()
                pts[ix]['verbs'] = verbs_list
                
                results[idx]['volunteers_allocated'].append(pts[ix])
                print("Volunteer:", pts[ix]["id"], "is assigned to Victim:", idx)
            clusters[kx]['points'] = [] #Clear the List

def model(X,clusters,k):
    done = False
    while not done:
        done = AssignPointsToClusters(X,clusters)
        UpdateClusters(clusters,k)
    
    

In [8]:
model(volunteers_df,clusters, k)
results

dist: [4, 0.0, 0.0, 0.0] max: 4.0 argmax(selected): 0
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [4, 13, 6, 0.0] max: 13.0 argmax(selected): 1
dist: [4, 0.0, 0.0, 0.0] max: 4.0 argmax(selected): 0
dist: [0.0, 5, 0.0, 0.0] max: 5.0 argmax(selected): 1
Volunteer: V1 is assigned to Victim: S1
Volunteer: V4 is assigned to Victim: S1
Volunteer: V3 is assigned to Victim: S2
Volunteer: V5 is assigned to Victim: S2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 1
dist: [2, 0.0, 6, 0.0] max: 6.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 0
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 0
Volunteer: V3 is assigned to Victim: S3
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 3
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 3
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 1
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


{'S1': {'help': 'Psychological',
  'verbs': ['medicin', 'drive', 'sing', 'danc', 'comedi'],
  'volunteers_allocated': [{'id': 'V1',
    'Volunteer': 'Singer',
    'verbs': [['singing']],
    'can_serve': 2},
   {'id': 'V4',
    'Volunteer': 'Doctor',
    'verbs': [['medicine']],
    'can_serve': 2}]},
 'S2': {'help': 'Food',
  'verbs': ['drive', 'farm', 'distribut'],
  'volunteers_allocated': [{'id': 'V3',
    'Volunteer': 'Driver',
    'verbs': [['driving', 'distributing']],
    'can_serve': 2},
   {'id': 'V5',
    'Volunteer': 'Farmer',
    'verbs': [['farming']],
    'can_serve': 2}]},
 'S3': {'help': 'Transportaion',
  'verbs': ['drive'],
  'volunteers_allocated': [{'id': 'V3',
    'Volunteer': 'Driver',
    'verbs': [['driving', 'distributing']],
    'can_serve': 1}]},
 'S4': {'help': 'Teaching',
  'verbs': ['teach', 'art', 'danc', 'scienc'],
  'volunteers_allocated': []}}

In [55]:
json_obj = json.dumps(results, indent=2)
with open('results.json', 'w') as f:
    f.write(json_obj)


In [56]:
print(json_obj)

{
  "S1": {
    "help": "Psychological",
    "verbs": [
      "medicine",
      "driving",
      "singing",
      "dancing",
      "comedy"
    ],
    "volunteers_allocated": [
      {
        "id": "V1",
        "Volunteer": "Singer",
        "verbs": [
          [
            "singing"
          ]
        ]
      },
      {
        "id": "V4",
        "Volunteer": "Doctor",
        "verbs": [
          [
            "medicine"
          ]
        ]
      }
    ]
  },
  "S2": {
    "help": "Food",
    "verbs": [
      "driving",
      "farming",
      "distributing"
    ],
    "volunteers_allocated": [
      {
        "id": "V3",
        "Volunteer": "Driver",
        "verbs": [
          [
            "driving",
            "distributing"
          ]
        ]
      },
      {
        "id": "V5",
        "Volunteer": "Farmer",
        "verbs": [
          [
            "farming"
          ]
        ]
      }
    ]
  },
  "S3": {
    "help": "Transportaion",
    "verbs": [
      "driv