## Import libraries

In [1]:
import pandas as pd
import numpy as np

import jellyfish
import seaborn as sns

## Import database with raw info

In [2]:
data_routes_raw = pd.read_csv('routes_info_raw.csv')
data_routes_raw.head()

Unnamed: 0,grade,name,sector,ascents,fos_ratio,recommendations,stars
0,7a,Freedom is a Battle,Trebenna West,781,66 %,10 %,3
1,7a,Karınca,Magara,699,55 %,16 %,4
2,7b,Lycian Highway,Trebenna West,645,70 %,13 %,4
3,6b+,Saxafon,Sarkit,591,66 %,20 %,4
4,6b,Nirvana,Magara,589,78 %,10 %,3


## Cleaning sectors name

List with sectors' name.

In [3]:
list_sectors = (data_routes_raw['sector'].sort_values().unique()).tolist()
list_sectors[:5]

['Akdeniz', 'Akyalar', 'Alaaddin', 'Alabalik', 'Alabalik Balcon']

### First step: searching misspells. Edge — 2 symbols. 

In [4]:
df_sectors_similarity = pd.DataFrame(columns=list_sectors, index=list_sectors, dtype='float')

Using # levenshtein_distance — 2

In [5]:
# levenshtein_distance — 2

for i in range(len(list_sectors)):
    for j in range(len(list_sectors)):
        if i >= j:
            df_sectors_similarity.iloc[i, j] = (
                jellyfish.levenshtein_distance(
                    list_sectors[i].lower(),
                    list_sectors[j].lower())
            )
        else:
            df_sectors_similarity.iloc[i, j] = np.nan

In [6]:
df_sectors_similarity.head()

Unnamed: 0,Akdeniz,Akyalar,Alaaddin,Alabalik,Alabalik Balcon,Alabalik Balkon,Alabalik balkon 4,Alabalık Balkon 1,Alcyone,Alibalik,...,leftcave,magara right,new,okuzini,okuzini cave,rusgarli bahce,sincap 2,trabenna new,trebbana,trebenna middle
Akdeniz,0.0,,,,,,,,,,...,,,,,,,,,,
Akyalar,5.0,0.0,,,,,,,,,...,,,,,,,,,,
Alaaddin,6.0,6.0,0.0,,,,,,,,...,,,,,,,,,,
Alabalik,6.0,5.0,4.0,0.0,,,,,,,...,,,,,,,,,,
Alabalik Balcon,13.0,11.0,10.0,7.0,0.0,,,,,,...,,,,,,,,,,


Collecting similar pairs in tuple with levenshtein_distance value.

In [7]:
LOW_EDGE = 2

jellyfish_filter = (df_sectors_similarity.values <= LOW_EDGE) & (df_sectors_similarity.values != 0)

similar_pairs_leven_dist = [
    (
        df_sectors_similarity.index[x],
        df_sectors_similarity.columns[y],
        df_sectors_similarity.iloc[x, y]
    )
    for x, y in zip(*np.where(jellyfish_filter))
]

In [8]:
similar_pairs_leven_dist

[('Alabalik Balkon', 'Alabalik Balcon', 1.0),
 ('Alabalik balkon 4', 'Alabalik Balkon', 2.0),
 ('Alabalık Balkon 1', 'Alabalik balkon 4', 2.0),
 ('Alibalik', 'Alabalik', 1.0),
 ('Echoes sağ2', 'Echoes Sag', 2.0),
 ('Gizmo 1', 'Gizmo', 2.0),
 ('Gizmo 2', 'Gizmo', 2.0),
 ('Gizmo 2', 'Gizmo 1', 1.0),
 ('Gizmo 3', 'Gizmo', 2.0),
 ('Gizmo 3', 'Gizmo 1', 1.0),
 ('Gizmo 3', 'Gizmo 2', 1.0),
 ('Güzel Manzara', 'Günzel Manzara', 1.0),
 ('Kekik', 'Geyik', 2.0),
 ('Kulluin Teras', 'Kulluin Taras', 1.0),
 ('Küllüin', 'Küllin1', 2.0),
 ('Ottoman', 'Ottaman', 1.0),
 ('Rüzgarli Bahçe', 'Ruzgarli Bahce', 2.0),
 ('Rüzgarli Bahçe', 'Rüzgarl? Bahçe', 1.0),
 ('Sincap 3', 'Sincap', 2.0),
 ('Trebena', 'Trabenna', 2.0),
 ('Trebene', 'Trebena', 1.0),
 ('Trebenna', 'Trabenna', 1.0),
 ('Trebenna', 'Trebena', 1.0),
 ('Trebenna', 'Trebene', 2.0),
 ('Trebenna West', 'Trebena West', 1.0),
 ('Trebenna West', 'Trebenna Ost', 2.0),
 ('Trebenna West 2', 'Trebenna West', 2.0),
 ('Trebenna West 4', 'Trebenna West', 2.0),

In [9]:
len(similar_pairs_leven_dist)

51

Creating new df with columns for clean sectors' and routes' name.

In [10]:
data_routes_sector_clean = data_routes_raw.copy()
data_routes_sector_clean.insert(loc=3, column='sector_clean', value=None)
data_routes_sector_clean.insert(loc=2, column='name_clean', value=None)

data_routes_sector_clean.head()

Unnamed: 0,grade,name,name_clean,sector,sector_clean,ascents,fos_ratio,recommendations,stars
0,7a,Freedom is a Battle,,Trebenna West,,781,66 %,10 %,3
1,7a,Karınca,,Magara,,699,55 %,16 %,4
2,7b,Lycian Highway,,Trebenna West,,645,70 %,13 %,4
3,6b+,Saxafon,,Sarkit,,591,66 %,20 %,4
4,6b,Nirvana,,Magara,,589,78 %,10 %,3


Function for changing wrong names. It will help us in the two steps.

In [11]:
def sector_name_revers (similar_pairs, column_name):
    """
    Function for changing wrong names in sector's column.
    :param similar_pairs: list of tuples with similar names of sectors
    :return: None
    """
    # Creating dictionary with 'wrong_value': ['true_value', ascents'_different]}

    similar_pairs_dict = {}

    for pair_sectors in similar_pairs:
        x = pair_sectors[0]
        y = pair_sectors[1]
    
        x_ascents = (
            data_routes_sector_clean[data_routes_sector_clean[column_name] == x]['ascents']
            .sum())
        y_ascents = (
            data_routes_sector_clean[data_routes_sector_clean[column_name] == y]['ascents']
            .sum())
    
        difference = abs(x_ascents - y_ascents)
        
        # if x-name is more popular than y-name,
        # it has more ascents, and we will remember in dict
        # y-name as a key and x-name as right name
    
        if x_ascents > y_ascents:
   
            try:
                # if dict already has this sector as a key,
                # we should check more popular sectors name variation
                
                if difference > similar_pairs_dict[y][1]:
                    similar_pairs_dict[y] = [x, difference]
            except:
                # if not, just write new key and value
                
                similar_pairs_dict[y] = [x, difference]
    
        else:
    
            try:
                if difference > similar_pairs_dict[x][1]:
                    similar_pairs_dict[x] = [y, difference]
            except:
                similar_pairs_dict[x] = [y, difference]
                
                
        # cleaning-part of function

        for index, row in data_routes_sector_clean.iterrows():
            sector_old_name = row[column_name]
        
            if sector_old_name in similar_pairs_dict:
                data_routes_sector_clean.loc[index, 'sector_clean'] = similar_pairs_dict[sector_old_name][0]
            else:
                data_routes_sector_clean.loc[index, 'sector_clean'] = sector_old_name
        

Fill sector_clean column.

In [12]:
# use function to find the most popular sectors name variation

sector_name_revers(similar_pairs_leven_dist, 'sector')

In [13]:
data_routes_sector_clean.head()

Unnamed: 0,grade,name,name_clean,sector,sector_clean,ascents,fos_ratio,recommendations,stars
0,7a,Freedom is a Battle,,Trebenna West,Trebenna West,781,66 %,10 %,3
1,7a,Karınca,,Magara,Magara,699,55 %,16 %,4
2,7b,Lycian Highway,,Trebenna West,Trebenna West,645,70 %,13 %,4
3,6b+,Saxafon,,Sarkit,Sarkit,591,66 %,20 %,4
4,6b,Nirvana,,Magara,Magara,589,78 %,10 %,3


### Second step: counting Jaro-Winkler Similarity. Edge — 85%. 


Cross table with sectors' name and their similarity percent.

In [14]:
df_sectors_similarity = pd.DataFrame(columns=list_sectors, index=list_sectors, dtype='float')

Using # Jaro-Winkler Similarity — 85%

In [15]:
# Jaro-Winkler Similarity — 85%

for i in range(len(list_sectors)):
    for j in range(len(list_sectors)):
        if i >= j:
            df_sectors_similarity.iloc[i, j] = (
                jellyfish.jaro_winkler_similarity(
                    list_sectors[i].lower(),
                    list_sectors[j].lower())
            )
        else:
            df_sectors_similarity.iloc[i, j] = np.nan

In [16]:
df_sectors_similarity.head()

Unnamed: 0,Akdeniz,Akyalar,Alaaddin,Alabalik,Alabalik Balcon,Alabalik Balkon,Alabalik balkon 4,Alabalık Balkon 1,Alcyone,Alibalik,...,leftcave,magara right,new,okuzini,okuzini cave,rusgarli bahce,sincap 2,trabenna new,trebbana,trebenna middle
Akdeniz,1.0,,,,,,,,,,...,,,,,,,,,,
Akyalar,0.52381,1.0,,,,,,,,,...,,,,,,,,,,
Alaaddin,0.607143,0.607143,1.0,,,,,,,,...,,,,,,,,,,
Alabalik,0.511905,0.607143,0.825,1.0,,,,,,,...,,,,,,,,,,
Alabalik Balcon,0.431746,0.615873,0.652778,0.906667,1.0,,,,,,...,,,,,,,,,,


Collecting similar pairs in tipple with jaro_winkler_similarity value.

In [17]:
LOW_EDGE = 0.85

jellyfish_filter = (df_sectors_similarity.values >= LOW_EDGE) & (df_sectors_similarity.values != 1)

similar_pairs_jw_similarity = [
    (
        df_sectors_similarity.index[x],
        df_sectors_similarity.columns[y],
        df_sectors_similarity.iloc[x, y]
    )
    for x, y in zip(*np.where(jellyfish_filter))
]

In [18]:
similar_pairs_jw_similarity

[('Alabalik Balcon', 'Alabalik', 0.9066666666666666),
 ('Alabalik Balkon', 'Alabalik', 0.9066666666666666),
 ('Alabalik Balkon', 'Alabalik Balcon', 0.9733333333333334),
 ('Alabalik balkon 4', 'Alabalik', 0.8941176470588236),
 ('Alabalik balkon 4', 'Alabalik Balcon', 0.9513725490196078),
 ('Alabalik balkon 4', 'Alabalik Balkon', 0.9764705882352941),
 ('Alabalık Balkon 1', 'Alabalik', 0.8573529411764705),
 ('Alabalık Balkon 1', 'Alabalik Balcon', 0.9262745098039216),
 ('Alabalık Balkon 1', 'Alabalik Balkon', 0.9513725490196078),
 ('Alabalık Balkon 1', 'Alabalik balkon 4', 0.9529411764705882),
 ('Alibalik', 'Alabalik', 0.8952380952380953),
 ('Anatolia Right', 'Anatolia', 0.9142857142857143),
 ('Anatolia Sag', 'Anatolia', 0.9333333333333333),
 ('Anatolia Sag', 'Anatolia Right', 0.9095238095238095),
 ('Antalya', 'Anatolia', 0.8507936507936509),
 ('Cesme Sol (Left)', 'Cesme', 0.8625),
 ('Dragonfly', 'Dragon', 0.9333333333333333),
 ('Echoes Rıght', 'Echoes', 0.9),
 ('Echoes Sag', 'Echoes', 0.

In [19]:
len(similar_pairs_jw_similarity)

325

Fill sector_clean column.

In [20]:
sector_name_revers(similar_pairs_jw_similarity, column_name='sector_clean')

In [21]:
data_routes_sector_clean.head()

Unnamed: 0,grade,name,name_clean,sector,sector_clean,ascents,fos_ratio,recommendations,stars
0,7a,Freedom is a Battle,,Trebenna West,Trebenna West,781,66 %,10 %,3
1,7a,Karınca,,Magara,Magara,699,55 %,16 %,4
2,7b,Lycian Highway,,Trebenna West,Trebenna West,645,70 %,13 %,4
3,6b+,Saxafon,,Sarkit,Sarkit,591,66 %,20 %,4
4,6b,Nirvana,,Magara,Magara,589,78 %,10 %,3


Check our progress.

In [22]:
print(
    f"Sectors before cleaning: {data_routes_sector_clean['sector'].nunique()}\n"
    f"Sectors after cleaning: {data_routes_sector_clean['sector_clean'].nunique()}"
)

Sectors before cleaning: 143
Sectors after cleaning: 65


Save sector_clean_name in file.

In [23]:
data_routes_sector_clean.to_csv('routes_info_sector_clean.csv', encoding='utf-8', index=False)