In [1]:
import pandas as pd

## Data preprocessing 

In [301]:
df = pd.read_excel("Inhabitant-term Data.xlsx")

In [302]:
df.head()

Unnamed: 0,City,Inhabitant Term
0,Chandrapur,Chandrapurkar
1,Gadchiroli,Gadchirolikar
2,Wardha,Wardhikar
3,Buldhana,Buldhanakar
4,Latur,Laturkar


In [312]:
suff_arr = []
for i in range(len(df)):
    city = df.iloc[i]['City']
    inhab = df.iloc[i]['Inhabitant Term']

    i = 0
    j = 0
    cityLen = len(city)
    inhabLen = len(inhab)

    while i<=cityLen-1 and j<=inhabLen-1 and city[i]==inhab[i]:
        i+=1
        j+=1
    
    suff_arr.append(inhab[j:])


In [313]:
df['Suffix'] = suff_arr

In [314]:
df.head()

Unnamed: 0,City,Inhabitant Term,Suffix,lattitude,longitude
0,Chandrapur,Chandrapurkar,kar,19.950816,79.298667
1,Gadchiroli,Gadchirolikar,kar,20.185397,80.003509
2,Wardha,Wardhikar,ikar,20.746494,78.599792
3,Buldhana,Buldhanakar,kar,20.53208,76.179911
4,Latur,Laturkar,kar,18.398339,76.562087


In [6]:
def get_geospatial_locations(place_name):
    access_token = 'pk.eyJ1IjoidmVkYW5naWl0YiIsImEiOiJjbGl2dHg0emkwY2R2M3NuNTF2c2Z2NHp4In0.yRK2FNlaIwNGujG51viPuw'
    base_url = "https://api.mapbox.com/geocoding/v5/mapbox.places/"
    params = {
        "access_token": access_token,
        "types": "place",
        "limit": 1
    }

    response = requests.get(base_url + f"{place_name}.json", params=params)
    
    if response.status_code == 200:
        data = response.json()
        locations = []
        
        for feature in data['features']:
            place = feature['place_name']
            longitude, latitude = feature['center']
            locations.append({
                'place': place,
                'latitude': latitude,
                'longitude': longitude
            })
        
        return locations
    else:
        print("Error:", response.status_code)
        return None

In [7]:
import requests

In [308]:
# Getting geoCoded data
lats = []
longs = []
for i in range(len(df)):
    geospatial_locations = get_geospatial_locations(df.iloc[i]['City'])
    if not len(geospatial_locations):
        print(f"No coordinate available for{df.iloc[i]['City']}")
        lat = None
        longi = None
    else:
        lat = geospatial_locations[0]['latitude']
        longi = geospatial_locations[0]['longitude']
    lats.append(lat)
    longs.append(longi)

No coordinate available forSindhudurg
No coordinate available forBarddhaman
No coordinate available forYankgaon
No coordinate available forMokokchung
No coordinate available forJharkhand
No coordinate available forMaharashtra
No coordinate available forMeghalaya
No coordinate available forMizoram
No coordinate available forNagaland
No coordinate available forRajasthan
No coordinate available forTelangana
No coordinate available forUttarakhand
No coordinate available forLakshadweep
No coordinate available forConnecticut
No coordinate available forMassachusetts
No coordinate available forHokkaido
No coordinate available forChungcheong
No coordinate available forChikkamagaluru
No coordinate available forMokokchung
No coordinate available forJhalawar
No coordinate available forBanaskantha
No coordinate available forGir Somnath
No coordinate available forMahisagar
No coordinate available forSabarkantha
No coordinate available forKurukshetra
No coordinate available forBardhaman
No coordinate

In [309]:
df['lattitude'] = lats 
df['longitude'] = longs

In [315]:
df.to_excel('data_updated.xlsx')

In [85]:
df.loc[295,'lattitude'] = 43.369420
df.loc[295,'longitude'] = 142.704834

In [110]:
df.to_excel('data_updated.xlsx')

In [109]:
df.head()

Unnamed: 0,City,Inhabitant Term,Ending Letter,Suffix,lattitude,longitude
0,Abohar,Abohari,r,i,30.145054,74.19566
1,Achalpur,Achalpuri,r,i,21.25467,77.508643
2,Adoor,Adoorite,r,ite,9.152966,76.735574
3,Agra,Agraan,a,an,27.175255,78.009816
4,Agartala,Agartalian,a,ian,23.831238,91.282382


In [112]:
df = pd.read_excel('data_updated.xlsx')

In [115]:
df.drop(df.columns[0],axis=1,inplace=True)

In [116]:
df.head()

Unnamed: 0,City,Inhabitant Term,Ending Letter,Suffix,lattitude,longitude
0,Abohar,Abohari,r,i,30.145054,74.19566
1,Achalpur,Achalpuri,r,i,21.25467,77.508643
2,Adoor,Adoorite,r,ite,9.152966,76.735574
3,Agra,Agraan,a,an,27.175255,78.009816
4,Agartala,Agartalian,a,ian,23.831238,91.282382


## Appraoch #1: K Nearest Neighbors

Method Details
- Get K nearest neighbors for the name
- There is no training step involved as such
- Distance is measured as Euclidean distance, i.e. using Lattitude and longitude
- (Can Try) -> Similarity of names as 3rd dimension
- For the closest names -> think of a approach to select the most relevant suffix

K => Hyperparam that we can fix

In [2]:
import math
def euclidean_dist(longi1,longi2,lat1,lat2):
    return math.sqrt(pow(longi1-longi2,2) + pow(lat1-lat2,2))

In [8]:
def get_inhab_name(place_name,k=7):
    dict1 = dict()

    geospatial_locations = get_geospatial_locations(place_name)

    if len(geospatial_locations)==0:
        print("Place doesn't exist")
        return 'ian'

    lat_name = geospatial_locations[0]['latitude']
    long_name = geospatial_locations[0]['longitude']

    for i in range(len(df)):
        dist1 = euclidean_dist(long_name,df.iloc[i]['longitude'],lat_name,df.iloc[i]['lattitude'])
        if dist1!=0:
            dict1[i] = dist1
    
    sorted_keys  = sorted(dict1,key=dict1.get)

    suffixes = []
    dist = []

    for j in range(k):
        suffixes.append(df.iloc[sorted_keys[j]]['Suffix'])
        dist.append(dict1[i])
    

    dict2 = dict()
    for i in range(len(suffixes)):
        if suffixes[i] in dict2:
            dict2[suffixes[i]]+=(1/dist[i])
        else:
            dict2[suffixes[i]]=(1/dist[i])

    # Getting the suffix with the most score
    possible_suffixes = sorted(dict2,key=dict2.get)
    return possible_suffixes[0]

In [22]:
df = pd.read_excel("data_updated_new.xlsx")

In [31]:
import random

# Define the range for random numbers (0 to n)
n = len(df)-1  # You can change this value to your desired upper limit

# Generate 20 random numbers between 0 and n
random_numbers = [random.randint(0, n) for _ in range(50)]

# Print the generated random numbers
count = 0
for num in random_numbers:
    pred_suff = get_inhab_name(df.iloc[num]['City'])
    if pred_suff==df.iloc[num]['Suffix']:
        count+=1

print(f"Accuracy: {count/len(random_numbers)*100} % ")

Place doesn't exist
Place doesn't exist
Accuracy: 14.000000000000002 % 


In [30]:
df['lattitude'] = df['lattitude'].apply(lambda x: float(x))

In [28]:
df['longitude'] = df['longitude'].apply(lambda x: float(x))

In [29]:
df.loc[170,'lattitude']= 11.7861

In [27]:
df.loc[758,'longitude'] = 88.3975

In [34]:
df.dtypes

Unnamed: 0           int64
City                object
Inhabitant Term     object
Suffix              object
lattitude          float64
longitude          float64
dtype: object

In [35]:
df.to_excel("data_updated_new.xlsx")

## Approach #2: Using Word encoding

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [228]:
# Import necessary libraries
import pandas as pd


In [229]:
data = pd.read_excel('data_updated_new.xlsx')

In [230]:
data['City'] = data['City'].apply(lambda x : x.lower())

In [231]:
data['Inhabitant Term'] = data['Inhabitant Term'].apply(lambda x: x.lower())

In [232]:
data = data.dropna()

In [233]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,City,Inhabitant Term,Suffix,lattitude,longitude
0,0,0,chandrapur,chandrapurkar,kar,19.950816,79.298667
1,1,1,gadchiroli,gadchirolikar,kar,20.185397,80.003509
2,2,2,wardha,wardhikar,ikar,20.746494,78.599792
3,3,3,buldhana,buldhanakar,kar,20.53208,76.179911
4,4,4,latur,laturkar,kar,18.398339,76.562087


In [234]:
X = data[['City','lattitude', 'longitude']]
y = data['Suffix']

In [235]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [236]:
# Convert categorical variables into numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_transformed = tfidf_vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x.astype(str)), axis=1))
X_test_transformed = tfidf_vectorizer.transform(X_test.apply(lambda x: ' '.join(x.astype(str)), axis=1))

In [237]:
from sklearn.ensemble import GradientBoostingClassifier

In [238]:
model = RandomForestClassifier(n_estimators=50, random_state=3)

In [239]:
model.fit(X_train_transformed, y_train)

In [240]:
y_pred = model.predict(X_test_transformed)

In [241]:
y_pred1 = model.predict(X_train_transformed)

In [242]:
accuracy_score(y_train,y_pred1)*100

96.23567921440261

In [243]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 28.76%


In [372]:
new_Data = 'mumbai'
geospatial_locations = get_geospatial_locations(new_Data)
lat = geospatial_locations[0]['latitude']
lon1 = geospatial_locations[0]['longitude']

In [373]:
df1 = pd.DataFrame({'City':[new_Data],'lattitude':[lat],'longitude':[lon1]})

In [374]:
transformed_X = tfidf_vectorizer.transform(df1.apply(lambda x: ' '.join(x.astype(str)), axis=1))

In [375]:
print(f"Prediction : {new_Data + model.predict(transformed_X)[0]}")

Prediction : mumbaikar


## Appraoch 3: Using Character encoding

In [213]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [214]:
all_letters = set()
maxLen = 0
for i in range(len(data)):
    word1 = data.iloc[i]['City']
    word2 = data.iloc[i]['Suffix']
    for j in word1:
        all_letters.add(j)
    for j in word2:
        all_letters.add(j)

    maxLen = max(maxLen,len(word1),len(word2))

In [215]:
char_to_int = {char: i for i, char in enumerate(all_letters)}

In [216]:
def one_hot_encode(char):
    encoding = np.zeros(len(char_to_int))
    encoding[char_to_int[char]] = 1
    return encoding

In [217]:
def get_encode(word):
    word_encoding = [one_hot_encode(char) for char in word]
    while len(word_encoding) < maxLen:
        word_encoding.append(np.zeros(len(char_to_int)))
    return np.array(word_encoding).flatten()

In [218]:
cityEnc = []
suffEnc = []
for i in range(len(data)):
    cityEnc.append(get_encode(data.iloc[i]['City']))
    suffEnc.append(get_encode(data.iloc[i]['Suffix']))

In [220]:
X_word = np.array(cityEnc)

In [221]:
X_float1 = np.array(data['lattitude']).reshape(-1, 1)
X_float2 = np.array(data['longitude']).reshape(-1, 1)

In [222]:
X_combined = np.hstack((X_word, X_float1, X_float2))

In [223]:
y = np.array(suffEnc)

In [224]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [225]:
# Standardize the float features (mean=0, std=1)
scaler = StandardScaler()
X_train[:, -(X_float1.shape[1] + X_float2.shape[1]):] = scaler.fit_transform(X_train[:, -(X_float1.shape[1] + X_float2.shape[1]):])
X_test[:, -(X_float1.shape[1] + X_float2.shape[1]):] = scaler.transform(X_test[:, -(X_float1.shape[1] + X_float2.shape[1]):])

In [67]:
# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)


In [68]:
print(f"Accuracy {accuracy*100:2f}%")

Accuracy 11.111111%


In [302]:
new_Data = 'bengal'
geospatial_locations = get_geospatial_locations(new_Data)
lat = geospatial_locations[0]['latitude']
lon1 = geospatial_locations[0]['longitude']

In [303]:
df1 = pd.DataFrame({'City':[new_Data],'lattitude':[lat],'longitude':[lon1]})

In [305]:
cityEnc_n = []
for i in range(len(df1)):
    cityEnc_n.append(get_encode(df1.iloc[i]['City']))

In [306]:
X_word = np.array(cityEnc_n)

In [307]:
X_float1 = np.array(df1['lattitude']).reshape(-1, 1)
X_float2 = np.array(df1['longitude']).reshape(-1, 1)

In [308]:
X_combined = np.hstack((X_word, X_float1, X_float2))

In [309]:
X_test = X_combined

In [310]:
X_test[:, -(X_float1.shape[1] + X_float2.shape[1]):] = scaler.transform(X_test[:, -(X_float1.shape[1] + X_float2.shape[1]):])

In [311]:
y_pred = rf_classifier.predict(X_test)

In [319]:
for i in range(len(y_pred.tolist()[0])):
    if y_pred.tolist()[0][i]>0:
        print(i)

18


In [None]:
list(all_letters)

In [322]:
list(all_letters)[18]

'i'

## Conclusion: 
2nd method performs the best with `28.76%` accuracy