In [5]:
import pandas as pd
import numpy as np
import chardet

# Detect file encoding
file_name = "data/PakistanSuicideAttacks.csv"
with open(file_name, "rb") as f:
    detect_result = chardet.detect(f.read(50000))
encoding = detect_result["encoding"]

In [12]:
# Find incosistent cities
data = pd.read_csv(file_name, encoding=encoding)
data.City = data.City.str.strip().str.lower()
city = data.City.unique()
city.sort()
city

array(['attock', 'bajaur agency', 'bannu', 'bhakkar', 'buner', 'chakwal',
       'chaman', 'charsadda', 'd. i khan', 'd.g khan', 'd.i khan',
       'dara adam khel', 'fateh jang', 'ghallanai, mohmand agency',
       'gujrat', 'hangu', 'haripur', 'hayatabad', 'islamabad',
       'jacobabad', 'karachi', 'karak', 'khanewal', 'khuzdar',
       'khyber agency', 'kohat', 'kuram agency', 'kurram agency',
       'lahore', 'lakki marwat', 'lasbela', 'lower dir', 'malakand',
       'mansehra', 'mardan', 'mohmand agency',
       'mosal kor, mohmand agency', 'multan', 'muzaffarabad',
       'north waziristan', 'nowshehra', 'orakzai agency', 'peshawar',
       'pishin', 'poonch', 'quetta', 'rawalpindi', 'sargodha',
       'sehwan town', 'shabqadar-charsadda', 'shangla', 'shikarpur',
       'sialkot', 'south waziristan', 'sudhanoti', 'sukkur', 'swabi',
       'swat', 'taftan', 'tangi, charsadda district', 'tank', 'taunsa',
       'tirah valley', 'totalai', 'upper dir', 'wagah', 'zhob'],
      dtype=

In [23]:
# "d. i khan" and "d.i khan" are the same city
# "kuram agency" and "kurram agency" are the same city
# Make them consistent
import fuzzywuzzy.process
inconsistent_cities = ["d.i khan", "kuram agency"]

def replace_matches(data, column, match_string, min_ratio=90):
    strings = data[column].unique()
    matches = fuzzywuzzy.process.extract(match_string, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]
    print(f"close matches for {match_string} are {' '.join(close_matches)}")
    rows_of_close_matches = data[column].isin(close_matches)
    data.loc[rows_of_close_matches, column] = match_string
    
for city in inconsistent_cities:
    replace_matches(data, "City", city)

cities = data.City.unique()
cities.sort()
cities

close matches for d.i khan are d.i khan
close matches for kuram agency are kuram agency


array(['attock', 'bajaur agency', 'bannu', 'bhakkar', 'buner', 'chakwal',
       'chaman', 'charsadda', 'd.g khan', 'd.i khan', 'dara adam khel',
       'fateh jang', 'ghallanai, mohmand agency', 'gujrat', 'hangu',
       'haripur', 'hayatabad', 'islamabad', 'jacobabad', 'karachi',
       'karak', 'khanewal', 'khuzdar', 'khyber agency', 'kohat',
       'kuram agency', 'lahore', 'lakki marwat', 'lasbela', 'lower dir',
       'malakand', 'mansehra', 'mardan', 'mohmand agency',
       'mosal kor, mohmand agency', 'multan', 'muzaffarabad',
       'north waziristan', 'nowshehra', 'orakzai agency', 'peshawar',
       'pishin', 'poonch', 'quetta', 'rawalpindi', 'sargodha',
       'sehwan town', 'shabqadar-charsadda', 'shangla', 'shikarpur',
       'sialkot', 'south waziristan', 'sudhanoti', 'sukkur', 'swabi',
       'swat', 'taftan', 'tangi, charsadda district', 'tank', 'taunsa',
       'tirah valley', 'totalai', 'upper dir', 'wagah', 'zhob'],
      dtype=object)