This program cleans the data on the prediction of well functionality.

In [1]:
import pandas as pd
import numpy as np
import os
import pdb
from tqdm import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import collections
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
os.chdir("C:\\Users\\Xiaotong\\Desktop\\wells")

In [3]:
def load():
    df1 = pd.read_csv("training.csv", parse_dates=["date_recorded"])
    df2 = pd.read_csv("labels.csv")
    df3 = pd.read_csv("test.csv", parse_dates=["date_recorded"])
    df =pd.merge(df2, df1, on="id", how="left")
    return df3, df

In [4]:
col = "installer"

In [5]:
train, test = load()

In [6]:
def fill_nan(df):
    df[col] = df[col].fillna("0")
    df[col] = df[col].str.lower()
    df[col] = df[col].replace(["not know", "not kno"], "0")
    return df

In [7]:
train = fill_nan(train)
test = fill_nan(test)

In [8]:
train["installer"].head()

0          dmdd
1           dwe
2             0
3    finn water
4        bruder
Name: installer, dtype: object

In [9]:
def get_composite_names(df, col):
    # list of full name entities
    fullnames_dic = {x: '' for x in df[col]}  # if len(x.split())>=2}
    fullnames_list = [x for x in df[col]]  # if len(x.split())>=2]
    for key in fullnames_dic:
        for i in range(len(key.split())):
            fullnames_dic[key] += key.split()[i][0]
    counter = collections.Counter(fullnames_list)
    return fullnames_dic, fullnames_list, counter

In [10]:
train_names_dic, train_names_list, train_names_counter = get_composite_names(train, col)
test_names_dic, test_names_list, test_names_counter = get_composite_names(test, col)

In [11]:
def fuzzymatch_one(counter, tol_partial, tol_sort, tol_set):
    """ match composite names duplicate, starting with the most frequent ones"""
    sorted_keys_desc = [x[0] for x in counter.most_common()]
    sorted_keys_asc = [x[0] for x in sorted(counter.items(), key=lambda x: x[1])]
    matched = {}
    fuzzdic = {}
    for key in tqdm(sorted_keys_desc):
        if (key not in matched) and (key not in fuzzdic):
            matched[key] = [(key, counter[key])]
            fuzzdic[key] = (key, key, counter[key])
            for item in sorted_keys_asc:
                if (item != key) and (item not in fuzzdic):
                    p0 = fuzz.ratio(key, item)
                    p1 = fuzz.partial_ratio(key, item)
                    p2 = fuzz.token_sort_ratio(key, item)
                    p3 = fuzz.token_set_ratio(key, item)
                    if (p1>tol_partial and p3>tol_set) or (p2>tol_sort and p3>tol_set):
                        matched[key].append((item, counter[item]))
                        fuzzdic[item] = (key, item, counter[item])
    return matched, fuzzdic

In [12]:
def hand_match_method1(matched, fuzzdic, key_list, value):
    for key in key_list:
        fuzzdic[key] = (value, key, fuzzdic[key][2]) 
        matched[value].append((key, fuzzdic[key][2]))
        if key in matched:
            for key2 in matched[key]:
                fuzzdic[key2[0]] = (value, key2[0], fuzzdic[key2[0]][2])
            del matched[key]
    return matched, fuzzdic

def hand_match_method2(matched, fuzzdic, value, string):
    for key in fuzzdic:
        if (string in fuzzdic[key][0]) and (key != value):
            fuzzdic[key] = (value, key, fuzzdic[key][2])
            matched[value].append((key, fuzzdic[key][2]))
            if key in matched:
                del matched[key]
    return matched, fuzzdic

def hand_remove(matched, fuzzdic, oldkey, newkey, value):
    fuzzdic[value[0]] = (newkey, value[0], fuzzdic[value[0]][2])
    matched[oldkey].remove(value)
    if newkey in matched:
        matched[newkey].append((value[0], value[1]))
    else:
        matched[newkey] = [(value[0], value[1])]
    return matched, fuzzdic


In [13]:
train_match, train_fuzz = fuzzymatch_one(train_names_counter, 50, 50, 90)

100%|████████████████████████████████████████| 978/978 [01:29<00:00, 10.88it/s]


In [14]:
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "local government", \
                                      ("local government", 1))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "local government", \
                                      ("village government", 1))

In [15]:
key_list = ['rc ch', 'rc c', 'roman church', 'roman catholic', 'roman cathoric -kilomeni', \
            'roman cathoric -same', 'rc cathoric', 'rulenge diocese', 'roman catholic rulenge diocese', \
            'roman ca', 'roman cathoric and water board', 'roman', 'roman cathoric same', \
            'roman catholic rulenge diocese', 'roman ca', 'roman cathoric -kilomeni', \
            'roman cathoric -same', 'roman church', 'roman catholic', 'rcchurch/cefa']
value = 'rc church'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['central govt', 'centr', 'tanzania government', 'tcrs /government', 'concern /government', \
            'adra /government', 'ministry of water', 'ministry of healthy', 'idara ya maji', 'wizara ya maji', \
            'gove', 'wachina', 'kuwait', 'kuwit']
value = 'government'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['consultant engineer', 'citizen engine', 'howard and humfrey consultant']
value = 'consulting engineer'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['local te', 'local contract', 'local fundi', 'local technical tec', 'local technical']
value = 'local  technician'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['sengerema water department', 'halmashauri ya wilaya sikonge', 'region water department', \
            'district water depar', 'distri', 'kigoma municipal']
value = 'local government'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['wananchi', 'village']
value = 'village council'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['priva', 'mzee shekhe', 'mzee ole', 'mzee maisha', 'mzee chacha', 'chacha', 'mzee kizunda', \
            'mzee matiti edwin']
value = 'private'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['finw', 'finwater', 'finwter']
value = 'fini water'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['unisef']
value = 'unicef'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['kkt']
value = 'kkkt'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

key_list = ['would bank', 'mileniam project']
value = 'world bank'
train_match, train_fuzz = hand_match_method1(train_match, train_fuzz, key_list, value)

string = 'local t'
value = 'kkkt church'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

string = 'italy'
value = 'italian government'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

string = 'would vission'
value = 'world vision'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

string = 'china henan contractor'
value = 'china henan construction'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

string = 'plan int'
value = 'plan internationa'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

string = 'rotery c'
value = 'rotary club'
train_match, train_fuzz = hand_match_method2(train_match, train_fuzz, value, string)

In [16]:
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('italy government', 1))

In [19]:
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('italian government', 2))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('japan government', 1))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('belgiam government', 2))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('finland government', 7))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('kuwait', 65))
train_match, train_fuzz = hand_remove(train_match, train_fuzz, "government", "foreign government", \
                                      ('kuwit', 1))

In [20]:
check = {key: value for key, value in train_match.items() if type(value)!=str}
[x for x in check.values()]

[[('dwe', 4351),
  ('ubalozi wa marekani/dwe', 1),
  ('consultant and dwe', 1),
  ('lgsp/dwe', 1),
  ('dwe/', 1),
  ('water aid/dwe', 3),
  ('dwe}', 7),
  ('rwe/dwe', 12),
  ('kkkt _ konde and dwe', 43)],
 [('0', 1081)],
 [('government', 476),
  ('colonial government', 1),
  ('british colonial government', 1),
  ('cebtral government', 1),
  ('tcrs /government', 1),
  ('adra /government', 1),
  ('central government/tlc', 1),
  ('concern /government', 2),
  ('isf/government', 4),
  ('tanzania government', 4),
  ('cipro/government', 5),
  ('central government', 170),
  ('central govt', 37),
  ('centr', 33),
  ('tanzania government', 4),
  ('tcrs /government', 1),
  ('concern /government', 2),
  ('adra /government', 1),
  ('ministry of water', 6),
  ('ministry of healthy', 1),
  ('idara ya maji', 32),
  ('wizara ya maji', 28),
  ('gove', 54),
  ('wachina', 22)],
 [('hesawa', 373), ('heasawa', 1)],
 [('rwe', 292),
  ('rwe/ community', 1),
  ('rwe/community', 1),
  ('rwe community', 1),
  ('

In [21]:
train_fuzz

{'0': ('0', '0', 1081),
 'aar': ('aar', 'aar', 1),
 'abasia': ('abasia', 'abasia', 5),
 'abdi mtili': ('abdi mtili', 'abdi mtili', 1),
 'abdul': ('abdul', 'abdul', 1),
 'abraham palanjo': ('abraham palanjo', 'abraham palanjo', 1),
 'ac': ('ac', 'ac', 3),
 'accra': ('accra', 'accra', 9),
 'acord': ('acord', 'acord', 1),
 'acra': ('acra', 'acra', 68),
 'act': ('act', 'act', 1),
 'action aid': ('action aid', 'action aid', 3),
 'action contre la faim': ('action contre la faim',
  'action contre la faim',
  3),
 'active kmk': ('active kmk', 'active kmk', 1),
 'active mkm': ('active mkm', 'active mkm', 7),
 'active tank co': ('co', 'active tank co', 2),
 'ad': ('ad', 'ad', 3),
 'adap': ('adap', 'adap', 1),
 'adb': ('adb', 'adb', 1),
 'adp': ('adp', 'adp', 4),
 'adp busangi': ('adp', 'adp busangi', 4),
 'adra': ('adra', 'adra', 42),
 'adra /community': ('community', 'adra /community', 7),
 'adra /government': ('government', 'adra /government', 1),
 'adra/ community': ('community', 'adra/ comm

In [None]:
def match_acronyms(matched, fuzzdic, df):
    
    return

In [None]:
# train_histo = np.histogram(list(trainnames_counter.values()), bins=len(trainnames_counter.values()))
# test_histo = np.histogram(list(testnames_counter.values()), bins=len(testnames_counter.values()))

In [None]:
# plt.bar(train_histo[1][:-1], train_histo[0])
# plt.ylim((0,30))
# plt.xlim(0,100)
# plt.xlabel("number of occurences of a name")
# plt.ylabel("number of names that occur x times")

In [None]:
# plt.bar(test_histo[1][:-1], test_histo[0])
# plt.ylim((0,30))
# plt.xlim(0,100)
# plt.xlabel("number of occurences of a name")
# plt.ylabel("number of names that occur x times")