In [1]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz



In [2]:
# FuzzyWuzzy is a library of Python which is used for string matching

In [3]:
# There are many methods of comparing string in python. Some of the main methods are:
#  Using regex
#  Simple compare
#  Using difflib

In [4]:
# FuzzyWuzzy algorithm is a powerful tool for finding similar strings even when they are 
#    not exactly the same. It helps us clean up messy data, link similar records, and improve 
#    tasks like spell-checking and word suggestions. By using FuzzyWuzzy, we can become better 
#    at comparing and matching strings in our programming projects. It’s a useful tool loved by 
#    programmers, data scientists, and researchers because it makes working with real-world data easier.
#   https://medium.com/@harikrishnanhari.india/understanding-fuzziness-exploring-the-fuzzywuzzy-algorithm-7e0b4b05f3d7


In [5]:
# The partial ratio algorithm considers partial matches and looks for the best 
#             matching substring within the longer string.
string1 = "apple"
string2 = "apples and bananas"
partial_ratio = fuzz.partial_ratio(string1, string2)
print(partial_ratio)  # Output: 100

100


In [6]:
string1 = "apple"
string2 = "apples and bananas"
best_matches = process.extract(string1, string2, limit=2)
print(best_matches)  

[('a', 90), ('p', 90)]


In [7]:
# process.extract() function that can be used to extract the best matches from a list of 
#       choices based on a target string. Here’s an example:

choices = ["apple", "banana", "orange", "pineapple"]
target = "appl"

best_matches = process.extract(target, choices, limit=3)
print(best_matches)  

[('pineapple', 90), ('apple', 89), ('banana', 22)]


In [8]:
# Order matters with partial ratio
# Check the similarity score
name = "Kurtis Pykes"
full_name = "Kurtis Pykes K D"

print(f"Partial ratio similarity score: {fuzz.partial_ratio(name, full_name)}")

# But order will not effect simple ratio if strings do not match
print(f"Simple ratio similarity score: {fuzz.ratio(name, full_name)}")

# Check the similarity score
name = "Kurtis Pykes"
full_name = "Kurtis K D Pykes"

print(f"Similarity score: {fuzz.partial_ratio(name, full_name)}")


Partial ratio similarity score: 100
Simple ratio similarity score: 86
Similarity score: 67


In [9]:
import pandas as pd
# https://www.datacamp.com/tutorial/fuzzy-string-python
# Creating a dataframe
dict_one = {
  "country": ["England", "Scotland", "Wales", "United Kingdom", "Northern Ireland"],
  "population_in_millions": [55.98, 5.45, 3.14, 67.33, 1.89]
}

dict_two = {
  "country": ["Northern Iland", "Wles", "Scotlnd", "Englnd", "United K."],
  "GDP_per_capita": [24900, 23882, 37460, 45101, 46510.28]
}

existing_data = pd.DataFrame(dict_one)
exported_data = pd.DataFrame(dict_two)

print(existing_data, exported_data, sep="\n\n")


            country  population_in_millions
0           England                   55.98
1          Scotland                    5.45
2             Wales                    3.14
3    United Kingdom                   67.33
4  Northern Ireland                    1.89

          country  GDP_per_capita
0  Northern Iland        24900.00
1            Wles        23882.00
2         Scotlnd        37460.00
3          Englnd        45101.00
4       United K.        46510.28


In [10]:
# Attempt to join the two dataframe
data = pd.merge(existing_data, exported_data, on="country", how="left")
print(data.head())

            country  population_in_millions  GDP_per_capita
0           England                   55.98             NaN
1          Scotland                    5.45             NaN
2             Wales                    3.14             NaN
3    United Kingdom                   67.33             NaN
4  Northern Ireland                    1.89             NaN


In [11]:
# Rename the misspelled columns
exported_data["country"] = exported_data["country"].apply(
  lambda x: process.extractOne(x, existing_data["country"], scorer=fuzz.partial_ratio)[0]
)

# Attempt to join the two dataframe
data = pd.merge(existing_data, exported_data, on="country", how="left")
print(data.head())

            country  population_in_millions  GDP_per_capita
0           England                   55.98        45101.00
1          Scotland                    5.45        37460.00
2             Wales                    3.14        23882.00
3    United Kingdom                   67.33        46510.28
4  Northern Ireland                    1.89        24900.00
