In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem import SnowballStemmer
import random
plt.style.use("seaborn")
import json

## Data Preparation
1. Get Data from victims.csv and volunteers.csv
2. clean data

In [4]:
# Get Victims Data
snow = SnowballStemmer(language='english')
victims_df = pd.read_csv("victims(1).csv")
victims_df["verbs"] = [[snow.stem(y) for y in x.split('-')] for x in victims_df["verbs"]]
victims_df["priorities"] = [[ int(y) for y in x.split('-') ] for x in victims_df["priorities"]]
print(victims_df.head())
print("\n\n")

# Get Volunteers Data
volunteers_df = pd.read_csv("volunteers(1).csv")
volunteers_df["verbs"] = [[snow.stem(y) for y in x.split('-')] for x in volunteers_df["verbs"]]
volunteers_df_copy = pd.read_csv("volunteers.csv")
volunteers_df_copy["verbs"] = [x.split('-') for x in volunteers_df_copy["verbs"]]
volunteers_df_copy.drop(columns=["can_serve"])
print(volunteers_df.head())

   id  help_needed                                           verbs  \
0  S1  Psycologist            [psycotherapi, medicin, teach, sing]   
1  S2       Dancer         [danc, drive, comedi, act, sing, paint]   
2  S3         Chef                 [cook, chariti, comedi, museum]   
3  S4       Docent  [museum, act, sing, paint, farm, provide food]   
4  S5     Traveler          [travel, medicin, write, problem solv]   

            priorities  
0        [10, 1, 1, 2]  
1  [10, 5, 7, 8, 5, 1]  
2        [10, 1, 1, 1]  
3  [10, 1, 7, 8, 1, 1]  
4        [10, 2, 3, 1]  



   id    volunteer                                     verbs  can_serve
0  V1       Singer                         [sing, carpentri]          2
1  V2       Dancer                             [danc, drive]          2
2  V3         Chef                   [cook, chariti, comedi]          2
3  V4  Psycologist  [psycotherapi, cook, sing, public speak]          2
4  V5     Traveler                  [travel, medicin, write]      

In [5]:
results = dict()
for i in range(victims_df.shape[0]):
    row = victims_df.iloc[i].values
    result = {}
    idx = row[0]
    result['help'] = row[1]
    result['verbs'] = row[2]
    result['volunteers_allocated'] = []
    results[idx] = result
    
#results
#results['S1']['volunteers_allocated'].append('V1')


In [6]:
k= victims_df.shape[0]
colors = ['green','red','blue','yellow','orange']
clusters = {}
for i in range(k):
    victim = victims_df.iloc[i].to_dict() # they will be Centroid of the clusters
    points = []
    cluster = {
        'victim': victim,
        'points': points,
        'color': colors[i%len(colors)]
    }
    clusters[i] = cluster

In [7]:
clusters


{0: {'victim': {'id': 'S1',
   'help_needed': 'Psycologist',
   'verbs': ['psycotherapi', 'medicin', 'teach', 'sing'],
   'priorities': [10, 1, 1, 2]},
  'points': [],
  'color': 'green'},
 1: {'victim': {'id': 'S2',
   'help_needed': 'Dancer',
   'verbs': ['danc', 'drive', 'comedi', 'act', 'sing', 'paint'],
   'priorities': [10, 5, 7, 8, 5, 1]},
  'points': [],
  'color': 'red'},
 2: {'victim': {'id': 'S3',
   'help_needed': 'Chef',
   'verbs': ['cook', 'chariti', 'comedi', 'museum'],
   'priorities': [10, 1, 1, 1]},
  'points': [],
  'color': 'blue'},
 3: {'victim': {'id': 'S4',
   'help_needed': 'Docent',
   'verbs': ['museum', 'act', 'sing', 'paint', 'farm', 'provide food'],
   'priorities': [10, 1, 7, 8, 1, 1]},
  'points': [],
  'color': 'yellow'},
 4: {'victim': {'id': 'S5',
   'help_needed': 'Traveler',
   'verbs': ['travel', 'medicin', 'write', 'problem solv'],
   'priorities': [10, 2, 3, 1]},
  'points': [],
  'color': 'orange'},
 5: {'victim': {'id': 'S6',
   'help_needed': 

In [8]:
def get_common_elements(v1,v2):
    return list(set(v1) & set(v2))

def remove_common_elements(a,b):
    return list(set(a) - set(b))

def get_distance(common, arr= None, priorities = None):
    if priorities is None or arr is None:
        return len(common)
    else:
        distance = np.sum([priorities[ix] for ix, val in enumerate(arr) if val in common])
        return distance

In [43]:
#distance(volunteers_df["verbs"].iloc[2],victims_df["verbs"].iloc[1])
'''
victim = volunteers_df.iloc[0].to_dict()
print(victim)
print(clusters)

cur_x = volunteers_df.iloc[1].to_dict()
print(cur_x)
print(clusters[2]['victim']['verbs'])
distance(cur_x['verbs'],clusters[2]['victim']['verbs'])

common = [1,2,3,4,5]
arr = [1,2,3,4,5,6,7,8,9,10]
priorities = [5,4,7,9,2,1,5,4,2,7]
print(get_distance(common))
'''

"\nvictim = volunteers_df.iloc[0].to_dict()\nprint(victim)\nprint(clusters)\n\ncur_x = volunteers_df.iloc[1].to_dict()\nprint(cur_x)\nprint(clusters[2]['victim']['verbs'])\ndistance(cur_x['verbs'],clusters[2]['victim']['verbs'])\n\ncommon = [1,2,3,4,5]\narr = [1,2,3,4,5,6,7,8,9,10]\npriorities = [5,4,7,9,2,1,5,4,2,7]\nprint(get_distance(common))\n"

In [9]:
# E-step
# Parameters : X = dataset -> pd.DataFrame
#            : clusters = victimslist -> dictionary 
def AssignPointsToClusters(Volunteers, clusters):
    done = True
    for ix in range(Volunteers.shape[0]):
        dist = []
        common_elements_list = []
        cur_vol = Volunteers.iloc[ix].to_dict()
        for kx in range(k):
            common_elements = get_common_elements(cur_vol['verbs'],clusters[kx]['victim']['verbs'])
            common_elements_list.append(common_elements)
            # Get Distance score based on Priorities
            distance = get_distance(common_elements, clusters[kx]['victim']['verbs'], clusters[kx]['victim']['priorities'])
            dist.append(distance)
        
        maximum = np.max(dist)
        max_list = [idx for idx,val in enumerate(dist) if val == maximum]
        # Randomly choose one victim to be assigned if multiple victims have same distance score
        cur_cluster = random.choice(max_list)
        print("dist:", dist, "max:", maximum, "argmax(selected):", cur_cluster)
        if maximum != 0:
            clusters[cur_cluster]['points'].append(cur_vol)
            # Remove common Elements from Victims so that further Volunteers are not assigned
            common_elements = common_elements_list[cur_cluster]
            priorities_list = clusters[cur_cluster]['victim']['priorities']
            priorities_list = [x for (i,x) in enumerate(priorities_list) if clusters[cur_cluster]['victim']['verbs'][i] not in common_elements]
            clusters[cur_cluster]['victim']['priorities'] = priorities_list
            clusters[cur_cluster]['victim']['verbs'] = remove_common_elements(clusters[cur_cluster]['victim']['verbs'],common_elements)    
            if cur_vol['can_serve'] == 1:
                cur_vol['verbs'] = remove_common_elements(cur_vol['verbs'],common_elements)
                volunteers_df['verbs'].iloc[ix] = cur_vol['verbs']
            else:
                volunteers_df['can_serve'].iloc[ix] = cur_vol['can_serve'] - 1
            done = False
    return done
        

# M-step
def UpdateClusters(clusters,k):
    for kx in range(k):
        pts = np.array(clusters[kx]['points'])
        if pts.shape[0]>0:
            for ix in range(pts.shape[0]):
                idx = clusters[kx]['victim']['id']
                verbs_list = volunteers_df_copy['verbs'].loc[volunteers_df_copy["id"] == pts[ix]['id']].to_list()
                pts[ix]['verbs'] = verbs_list
                
                results[idx]['volunteers_allocated'].append(pts[ix])
                print("Volunteer:", pts[ix]["id"], "is assigned to Victim:", idx)
            clusters[kx]['points'] = [] #Clear the List

def model(X,clusters,k):
    done = False
    while not done:
        done = AssignPointsToClusters(X,clusters)
        UpdateClusters(clusters,k)
    
    

In [10]:
model(volunteers_df,clusters, k)
results

dist: [2, 5, 0.0, 7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7, 8, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 5, 1, 0.0, 0.0, 0.0, 3, 1] max: 8.0 argmax(selected): 14
dist: [0.0, 15, 0.0, 0.0, 0.0, 3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 10, 5, 0.0, 10, 0.0, 0.0, 5, 5, 0.0, 3, 1, 0.0, 0.0] max: 15.0 argmax(selected): 1
dist: [0.0, 8, 12, 0.0, 0.0, 1, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6, 0.0, 15, 0.0, 0.0, 0.0, 0.0, 0.0, 10, 0.0, 0.0, 2, 0.0, 4, 1, 0.0, 10] max: 15.0 argmax(selected): 15
dist: [12, 7, 10, 7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2, 0.0, 1, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 5, 2, 5, 5, 0.0, 3, 11] max: 12.0 argmax(selected): 0
dist: [1, 0.0, 0.0, 0.0, 15, 0.0, 10, 10, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7, 0.0, 10, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 7, 0.0, 0.0, 1] max: 15.0 argmax(selected): 4
dist: [0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 25
dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 27
dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 18
dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 26
dist: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

{'S1': {'help': 'Psycologist',
  'verbs': ['psycotherapi', 'medicin', 'teach', 'sing'],
  'volunteers_allocated': [{'id': 'V4',
    'volunteer': 'Psycologist',
    'verbs': [['medicine']],
    'can_serve': 2},
   {'id': 'V45', 'volunteer': 'Doctor', 'verbs': [], 'can_serve': 2},
   {'id': 'V51', 'volunteer': 'Singer', 'verbs': [], 'can_serve': 2}]},
 'S2': {'help': 'Dancer',
  'verbs': ['danc', 'drive', 'comedi', 'act', 'sing', 'paint'],
  'volunteers_allocated': [{'id': 'V2',
    'volunteer': 'Dancer',
    'verbs': [['acting', 'singing']],
    'can_serve': 2},
   {'id': 'V32', 'volunteer': 'Aid work', 'verbs': [], 'can_serve': 2},
   {'id': 'V48', 'volunteer': 'Painter', 'verbs': [], 'can_serve': 2},
   {'id': 'V18', 'volunteer': 'Artist', 'verbs': [], 'can_serve': 1},
   {'id': 'V22', 'volunteer': 'Comedian', 'verbs': [], 'can_serve': 1}]},
 'S3': {'help': 'Chef',
  'verbs': ['cook', 'chariti', 'comedi', 'museum'],
  'volunteers_allocated': [{'id': 'V36',
    'volunteer': 'Programmer

In [18]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

json_obj = json.dumps(results, indent=2, cls=NpEncoder)
with open('results.json', 'w') as f:
    f.write(json_obj)


In [19]:
print(json_obj)

{
  "S1": {
    "help": "Psycologist",
    "verbs": [
      "psycotherapi",
      "medicin",
      "teach",
      "sing"
    ],
    "volunteers_allocated": [
      {
        "id": "V4",
        "volunteer": "Psycologist",
        "verbs": [
          [
            "medicine"
          ]
        ],
        "can_serve": 2
      },
      {
        "id": "V45",
        "volunteer": "Doctor",
        "verbs": [],
        "can_serve": 2
      },
      {
        "id": "V51",
        "volunteer": "Singer",
        "verbs": [],
        "can_serve": 2
      }
    ]
  },
  "S2": {
    "help": "Dancer",
    "verbs": [
      "danc",
      "drive",
      "comedi",
      "act",
      "sing",
      "paint"
    ],
    "volunteers_allocated": [
      {
        "id": "V2",
        "volunteer": "Dancer",
        "verbs": [
          [
            "acting",
            "singing"
          ]
        ],
        "can_serve": 2
      },
      {
        "id": "V32",
        "volunteer": "Aid work",
        "ver