In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem import SnowballStemmer
import random
plt.style.use("seaborn")
import json

## Data Preparation
1. Get Data from victims.csv and volunteers.csv
2. clean data

In [18]:
# Get Victims Data
snow = SnowballStemmer(language='english')
victim_file = "victims_sample_v2.csv"
victims_df = pd.read_csv(victim_file)
victims_df["skills"] = [[snow.stem(y) for y in x.split('-')] for x in victims_df["skills"]]
victims_df["priorities"] = [[ int(y) for y in x.split('-') ] for x in victims_df["priorities"]]
print(victims_df.head())
print("\n\n")

# Get Volunteers Data
volunteer_file = "volunteers_sample_v2.csv"
volunteers_df = pd.read_csv(volunteer_file)
volunteers_df["skills"] = [[snow.stem(y) for y in x.split('-')] for x in volunteers_df["skills"]]
volunteers_df_copy = pd.read_csv(volunteer_file)
volunteers_df_copy["skills"] = [x.split('-') for x in volunteers_df_copy["skills"]]
volunteers_df_copy.drop(columns=["can_serve"])
print(volunteers_df.head())

   id    help_needed                                skills       priorities
0  S1  Psychological  [medicin, drive, sing, danc, comedi]  [9, 2, 4, 4, 5]
1  S2           Food              [drive, farm, distribut]        [5, 9, 8]
2  S3  Transportaion                               [drive]              [6]
3  S4       Teaching            [teach, art, danc, scienc]     [9, 8, 7, 6]



   id Volunteer              skills  can_serve
0  V1    Singer              [sing]          2
1  V2    Artist         [act, sing]          2
2  V3    Driver  [drive, distribut]          2
3  V4    Doctor           [medicin]          2
4  V5    Farmer              [farm]          2


In [19]:
results = dict()
for i in range(victims_df.shape[0]):
    row = victims_df.iloc[i].values
    result = {}
    idx = row[0]
    result['help'] = row[1]
    result['skills'] = row[2]
    result['volunteers_allocated'] = []
    results[idx] = result
    
#results
#results['S1']['volunteers_allocated'].append('V1')


In [20]:
k= victims_df.shape[0]
colors = ['green','red','blue','yellow','orange']
clusters = {}
for i in range(k):
    victim = victims_df.iloc[i].to_dict() # they will be Centroid of the clusters
    points = []
    cluster = {
        'victim': victim,
        'points': points,
        'color': colors[i%len(colors)]
    }
    clusters[i] = cluster

In [21]:
clusters

{0: {'victim': {'id': 'S1',
   'help_needed': 'Psychological',
   'skills': ['medicin', 'drive', 'sing', 'danc', 'comedi'],
   'priorities': [9, 2, 4, 4, 5]},
  'points': [],
  'color': 'green'},
 1: {'victim': {'id': 'S2',
   'help_needed': 'Food',
   'skills': ['drive', 'farm', 'distribut'],
   'priorities': [5, 9, 8]},
  'points': [],
  'color': 'red'},
 2: {'victim': {'id': 'S3',
   'help_needed': 'Transportaion',
   'skills': ['drive'],
   'priorities': [6]},
  'points': [],
  'color': 'blue'},
 3: {'victim': {'id': 'S4',
   'help_needed': 'Teaching',
   'skills': ['teach', 'art', 'danc', 'scienc'],
   'priorities': [9, 8, 7, 6]},
  'points': [],
  'color': 'yellow'}}

In [22]:
def get_common_elements(v1,v2):
    return list(set(v1) & set(v2))

def remove_common_elements(a,b):
    return list(set(a) - set(b))

def get_distance(common, arr= None, priorities = None):
    if priorities is None or arr is None:
        return len(common)
    else:
        distance = np.sum([priorities[ix] for ix, val in enumerate(arr) if val in common])
        return distance

In [23]:
# E-step
# Parameters : X = dataset -> pd.DataFrame
#            : clusters = victimslist -> dictionary 
def AssignPointsToClusters(Volunteers, clusters):
    done = True
    for ix in range(Volunteers.shape[0]):
        dist = []
        common_elements_list = []
        cur_vol = Volunteers.iloc[ix].to_dict()
        for kx in range(k):
            common_elements = get_common_elements(cur_vol['skills'],clusters[kx]['victim']['skills'])
            common_elements_list.append(common_elements)
            # Get Distance score based on Priorities
            distance = get_distance(common_elements, clusters[kx]['victim']['skills'], clusters[kx]['victim']['priorities'])
            dist.append(distance)
        
        maximum = np.max(dist)
        max_list = [idx for idx,val in enumerate(dist) if val == maximum]
        # Randomly choose one victim to be assigned if multiple victims have same distance score
        cur_cluster = random.choice(max_list)
        print("dist:", dist, "max:", maximum, "argmax(selected):", cur_cluster)
        if maximum != 0:
            clusters[cur_cluster]['points'].append(cur_vol)
            # Remove common Elements from Victims so that further Volunteers are not assigned
            common_elements = common_elements_list[cur_cluster]
            priorities_list = clusters[cur_cluster]['victim']['priorities']
            priorities_list = [x for (i,x) in enumerate(priorities_list) if clusters[cur_cluster]['victim']['skills'][i] not in common_elements]
            clusters[cur_cluster]['victim']['priorities'] = priorities_list
            clusters[cur_cluster]['victim']['skills'] = remove_common_elements(clusters[cur_cluster]['victim']['skills'],common_elements)    
            if cur_vol['can_serve'] == 1:
                cur_vol['skills'] = remove_common_elements(cur_vol['skills'],common_elements)
                volunteers_df['skills'].iloc[ix] = cur_vol['skills']
            else:
                volunteers_df['can_serve'].iloc[ix] = cur_vol['can_serve'] - 1
            done = False
    return done
        

# M-step
def UpdateClusters(clusters,k):
    for kx in range(k):
        pts = np.array(clusters[kx]['points'])
        if pts.shape[0]>0:
            for ix in range(pts.shape[0]):
                idx = clusters[kx]['victim']['id']
                verbs_list = volunteers_df_copy['skills'].loc[volunteers_df_copy["id"] == pts[ix]['id']].to_list()
                pts[ix]['skills'] = verbs_list
                
                results[idx]['volunteers_allocated'].append(pts[ix])
                print("Volunteer:", pts[ix]["id"], "is assigned to Victim:", idx)
            clusters[kx]['points'] = [] #Clear the List

def model(X,clusters,k):
    done = False
    while not done:
        done = AssignPointsToClusters(X,clusters)
        UpdateClusters(clusters,k)
    
    

In [24]:
model(volunteers_df,clusters, k)
results

dist: [4, 0.0, 0.0, 0.0] max: 4.0 argmax(selected): 0
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 3
dist: [2, 13, 6, 0.0] max: 13.0 argmax(selected): 1
dist: [9, 0.0, 0.0, 0.0] max: 9.0 argmax(selected): 0
dist: [0.0, 9, 0.0, 0.0] max: 9.0 argmax(selected): 1
Volunteer: V1 is assigned to Victim: S1
Volunteer: V4 is assigned to Victim: S1
Volunteer: V3 is assigned to Victim: S2
Volunteer: V5 is assigned to Victim: S2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 3
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 0
dist: [4, 0.0, 6, 0.0] max: 6.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
Volunteer: V3 is assigned to Victim: S3
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 1
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 0
dist: [0.0, 0.0, 0.0, 0.0] max: 0.0 argmax(selected): 2
dist: [0.0, 0.0, 0.0, 0.0] max

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


{'S1': {'help': 'Psychological',
  'skills': ['medicin', 'drive', 'sing', 'danc', 'comedi'],
  'volunteers_allocated': [{'id': 'V1',
    'Volunteer': 'Singer',
    'skills': [['singing']],
    'can_serve': 2},
   {'id': 'V4',
    'Volunteer': 'Doctor',
    'skills': [['medicine']],
    'can_serve': 2}]},
 'S2': {'help': 'Food',
  'skills': ['drive', 'farm', 'distribut'],
  'volunteers_allocated': [{'id': 'V3',
    'Volunteer': 'Driver',
    'skills': [['driving', 'distributing']],
    'can_serve': 2},
   {'id': 'V5',
    'Volunteer': 'Farmer',
    'skills': [['farming']],
    'can_serve': 2}]},
 'S3': {'help': 'Transportaion',
  'skills': ['drive'],
  'volunteers_allocated': [{'id': 'V3',
    'Volunteer': 'Driver',
    'skills': [['driving', 'distributing']],
    'can_serve': 1}]},
 'S4': {'help': 'Teaching',
  'skills': ['teach', 'art', 'danc', 'scienc'],
  'volunteers_allocated': []}}

In [18]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

json_obj = json.dumps(results, indent=2, cls=NpEncoder)
with open('results.json', 'w') as f:
    f.write(json_obj)


In [19]:
print(json_obj)

{
  "S1": {
    "help": "Psycologist",
    "verbs": [
      "psycotherapi",
      "medicin",
      "teach",
      "sing"
    ],
    "volunteers_allocated": [
      {
        "id": "V4",
        "volunteer": "Psycologist",
        "verbs": [
          [
            "medicine"
          ]
        ],
        "can_serve": 2
      },
      {
        "id": "V45",
        "volunteer": "Doctor",
        "verbs": [],
        "can_serve": 2
      },
      {
        "id": "V51",
        "volunteer": "Singer",
        "verbs": [],
        "can_serve": 2
      }
    ]
  },
  "S2": {
    "help": "Dancer",
    "verbs": [
      "danc",
      "drive",
      "comedi",
      "act",
      "sing",
      "paint"
    ],
    "volunteers_allocated": [
      {
        "id": "V2",
        "volunteer": "Dancer",
        "verbs": [
          [
            "acting",
            "singing"
          ]
        ],
        "can_serve": 2
      },
      {
        "id": "V32",
        "volunteer": "Aid work",
        "ver

## Connecting to DB

In [5]:
# !pip3 install pymongo
!pip3 install pymongo[srv]

Collecting dnspython<2.0.0,>=1.16.0; extra == "srv"
  Downloading dnspython-1.16.0-py2.py3-none-any.whl (188 kB)
Installing collected packages: dnspython
Successfully installed dnspython-1.16.0


You should consider upgrading via the 'c:\users\sathyamoorthy pandia\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [1]:
import sys
sys.path.append('C:/Users/sathyamoorthy pandia/AppData/Local/Programs/Python/Python37/Lib/site-packages')
from pymongo import MongoClient

In [2]:
client = MongoClient('mongodb+srv://annu:ammu@cluster0.9896d.mongodb.net/ask-foundation?retryWrites=true&w=majority')

In [3]:
db = client.get_database('ask-foundation')
# cursor = db[collection].find(query) 
# # Expand the cursor and construct the DataFrame
# df =  pd.DataFrame(list(cursor))
# df.head


In [5]:
print(db.list_collection_names)
print(list(db['volunteers'].find()))

<bound method Database.list_collection_names of Database(MongoClient(host=['cluster0-shard-00-01.9896d.mongodb.net:27017', 'cluster0-shard-00-02.9896d.mongodb.net:27017', 'cluster0-shard-00-00.9896d.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-bgjr73-shard-0', ssl=True), 'ask-foundation')>
[]


In [8]:
vol_list = list(db['volunteers'].find())
print(vol_list)

[]


In [9]:
vic_list = list(db['victims'].find())
print(vic_list)

[]
