### Importing necessary libraries

In [246]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
from collections import defaultdict
import Levenshtein

#### Importing Dataset

In [247]:
df = pd.read_csv("data.csv")

### Data Preprocessing

In [248]:
wordset = set()
wordlist = []
for i in df.message:
    wordset = wordset.union(word for word in i.split(" ")) # creating set of all words in the dataset
    wordlist += i.split(" ") # creating list of all words in the dataset to include frequency of each word
print(len(wordlist))
print(len(wordset))

2719
636


In [249]:
# %pip install levenshtein

In [250]:
# Function to group words with at most 2 different letters
def group_similar_words(words,score):
    grouped_words = defaultdict(list) # We use defaultdict to avoid key errors when adding values to the dictionary.
    used = set()
    
    for word1, word2 in combinations(words, 2):
        distance = Levenshtein.hamming(word1, word2)
        if distance <= 1 and len(word1) == len(word2):
            if word1 not in used and word2 not in used:
                grouped_words[word1].append(word2) # grouped_words[word1] will automatically create an empty list (due to defaultdict) if word1 does not already exist in the dictionary.
                used.add(word2)

    return grouped_words

In [253]:
ordered_wordlist = []
for i in pd.DataFrame(wordlist).value_counts().keys(): # sorting words by frequency
    ordered_wordlist.append(i[0]) # i is a tuple, i[0] is the word
len(ordered_wordlist)

636

In [254]:
grouped_ordered_words = dict(group_similar_words(ordered_wordlist,2))
print(grouped_ordered_words)
len(grouped_ordered_words)

{'terranix': ['tprranix', 'terramix', 'tekranix', 'tehranix'], 'sirenix': ['surenix', 'sirtnix', 'sirepix', 'airenix'], 'pluvia': ['pldvia'], 'ragex': ['ragux', 'rages', 'ragbx'], 'solarix': ['solarrx', 'solariy', 'solarih'], 'nebuz': ['qebuz', 'nebzz', 'ncbuz', 'neguz', 'nxbuz', 'yebuz'], 'quasar': ['qualar', 'qudsar'], 'gryphox': ['grypyox', 'grbphox', 'gryohox', 'gryphod', 'gryphov', 'gryphpx'], 'faerix': ['faemix', 'faerkx', 'faeyix', 'fjerix'], 'astron': ['ustron', 'asbron', 'aslron', 'astrln', 'astrob', 'astroh', 'astrrn'], 'nebulax': ['nebusax', 'nebuoax', 'nebulyx', 'nebulix', 'nebuhax', 'nxbulax', 'nibulax', 'ngbulax', 'febulax'], 'floraz': ['floran', 'fboraz', 'florgz', 'floraj'], 'cosmix': ['yosmix', 'cofmix', 'cosmia', 'cosmpx', 'hosmix'], 'vortex': ['vcrtex', 'voutex', 'vorcex', 'dortex'], 'cryptoz': ['crqptoz', 'cryftoz', 'crypooz', 'chyptoz'], 'pollex': ['pallex'], 'celestar': ['cehestar', 'celejtar', 'celistar', 'celwstar', 'cewestar', 'czlestar', 'ceaestar'], 'herox': 

125

In [255]:
def replace_words(text, word_dict):
    words = text.split()
    result = []
    
    # Iterate through each word in the text
    for word in words:
        replaced = False
        # Check each key-value pair in the dictionary
        for key, synonyms in word_dict.items():
            if word in synonyms:
                result.append(key)  # Replace word with the dictionary key
                replaced = True
                break
        if not replaced:
            result.append(word)  # If no replacement found, keep the original word
    
    # Join the result back into a string
    return ' '.join(result)

In [193]:
print(replace_words(df.message[69],grouped_ordered_words))
print(df.message[69])

astron novum quasar glixx
asbron novum quasar glixx


Here we can see how the data is changing after each step of the replacement of words.

In [256]:
for i in range(len(df.message)):
    df.loc[i,"message"] = replace_words(df.message[i],grouped_ordered_words)

In [257]:
df.head()

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,no,Aquari
1,cosmix xeno nebuz orbitaz,5,yes,Zorblax
2,solarix glixx novum galaxum quasar,5,yes,Zorblax
3,arbor insectus petros ekos rootix nimbus,2,yes,Florian
4,mermax drakos lorix epikoz deitax,4,no,Faerix


In [259]:
from sklearn.preprocessing import OneHotEncoder

In [260]:
encoder = OneHotEncoder(sparse_output=False)  # Set sparse_output=False to return a dense array

encoded_tail = encoder.fit_transform(df[['tail']])

encoded_tail_df = pd.DataFrame(encoded_tail, columns=encoder.get_feature_names_out(['tail']))

# Concatenate the original DataFrame with the one-hot encoded DataFrame
df = pd.concat([df, encoded_tail_df], axis=1).drop('tail', axis=1)  # Optionally drop original 'tail' column
df.drop(["tail_no"],axis=1,inplace=True)

In [261]:
df.head()

Unnamed: 0,message,fingers,species,tail_yes
0,pluvia arbor aquos,4,Aquari,0.0
1,cosmix xeno nebuz orbitaz,5,Zorblax,1.0
2,solarix glixx novum galaxum quasar,5,Zorblax,1.0
3,arbor insectus petros ekos rootix nimbus,2,Florian,1.0
4,mermax drakos lorix epikoz deitax,4,Faerix,0.0


In [264]:
df['Message_Tokens'] = df['message'].apply(lambda x: x.split())
df.head()

Unnamed: 0,message,fingers,species,tail_yes,Message_Tokens
0,pluvia arbor aquos,4,Aquari,0.0,"[pluvia, arbor, aquos]"
1,cosmix xeno nebuz orbitaz,5,Zorblax,1.0,"[cosmix, xeno, nebuz, orbitaz]"
2,solarix glixx novum galaxum quasar,5,Zorblax,1.0,"[solarix, glixx, novum, galaxum, quasar]"
3,arbor insectus petros ekos rootix nimbus,2,Florian,1.0,"[arbor, insectus, petros, ekos, rootix, nimbus]"
4,mermax drakos lorix epikoz deitax,4,Faerix,0.0,"[mermax, drakos, lorix, epikoz, deitax]"


Using Word2Vec to convert words into vectors.

In [266]:
from gensim.models import Word2Vec

In [267]:
w2v_model = Word2Vec(
    sentences=df['Message_Tokens'],   # Tokenized messages
    vector_size=20,                   # Word vector size (20 dimensions)
    window=3,                         # Context window size
    min_count=1,                      # Include all words (min frequency = 1)
    sg=0,                             # Use CBOW (faster for smaller data)
)

# Train the Word2Vec model on the messages
w2v_model.train(df['Message_Tokens'], total_examples=len(df['Message_Tokens']), epochs=15)


(19521, 40785)

In [272]:
def sentence_vector(tokens, model):
    # If a token is in the model's vocabulary, get its vector; otherwise, ignore it
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)  # Return a zero vector if no words are found
    else:
        return np.mean(vectors, axis=0)  # Average the word vectors to get the sentence vector


X_text = np.array(df['Message_Tokens'].apply(lambda tokens: sentence_vector(tokens, w2v_model)).tolist())
X_text.shape

(500, 20)

X_text is an array where each row is the averaged vector representation of an alien message.

### Combining Features

In [273]:
X_fingers = df['fingers'].values.reshape(-1, 1)
X_tail = df['tail_yes'].values.reshape(-1, 1)

# Combine all features (Text, Number of Fingers, Tail)
X = np.hstack([X_text, X_fingers, X_tail])
X.shape

(500, 22)

In [274]:
from sklearn.model_selection import train_test_split

# Define target (Species)
y = df['species']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [275]:
y_train

72     Emotivor
182     Quixnar
131     Zorblax
410     Florian
193     Florian
         ...   
106     Quixnar
270    Emotivor
348     Zorblax
435     Florian
102     Sentire
Name: species, Length: 450, dtype: object

In [277]:
from sklearn.preprocessing import LabelEncoder
# Initialize the label encoder
label_encoder = LabelEncoder()

# Encode the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [278]:
y_train_encoded

array([2, 7, 9, 4, 4, 5, 7, 6, 2, 7, 6, 7, 4, 1, 3, 1, 8, 5, 3, 6, 0, 4,
       2, 9, 5, 4, 5, 9, 1, 3, 8, 7, 7, 4, 6, 7, 9, 3, 9, 1, 2, 0, 8, 7,
       7, 1, 7, 7, 1, 5, 4, 7, 4, 2, 8, 0, 2, 9, 8, 7, 0, 1, 6, 8, 4, 3,
       2, 4, 5, 6, 2, 6, 7, 4, 2, 9, 2, 1, 2, 3, 8, 7, 9, 7, 2, 8, 6, 5,
       0, 6, 9, 0, 1, 1, 4, 8, 0, 0, 5, 1, 6, 7, 6, 4, 2, 7, 4, 4, 1, 3,
       2, 4, 0, 6, 7, 1, 7, 5, 3, 8, 8, 9, 3, 2, 3, 9, 9, 3, 7, 3, 2, 6,
       7, 5, 7, 0, 8, 4, 0, 8, 2, 9, 8, 3, 3, 3, 5, 7, 4, 6, 8, 0, 2, 5,
       6, 5, 9, 5, 5, 0, 6, 1, 1, 2, 9, 9, 4, 9, 6, 1, 5, 3, 2, 9, 1, 0,
       7, 0, 8, 2, 8, 0, 9, 6, 4, 5, 5, 1, 2, 1, 5, 7, 7, 2, 4, 9, 9, 9,
       2, 1, 4, 2, 5, 2, 1, 9, 7, 8, 7, 7, 0, 7, 1, 6, 1, 5, 1, 2, 1, 9,
       2, 4, 9, 0, 6, 5, 2, 5, 4, 7, 1, 0, 5, 1, 4, 9, 4, 6, 0, 6, 6, 0,
       1, 7, 6, 8, 8, 4, 8, 5, 9, 3, 7, 7, 2, 1, 5, 9, 2, 7, 5, 5, 4, 4,
       2, 8, 7, 1, 0, 0, 1, 3, 4, 2, 4, 0, 5, 7, 2, 5, 3, 6, 6, 6, 2, 5,
       5, 3, 2, 9, 3, 5, 1, 8, 1, 2, 4, 4, 3, 8, 4,

In [281]:
from sklearn import svm

In [283]:
model = svm.SVC(kernel='linear', C=25,gamma='auto')
model.fit(X_train, y_train_encoded)

In [284]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 90.00%


In [286]:
x_test_df = pd.read_csv("test.csv")

In [287]:
wordset_test = set()
wordlist_test = []
for i in x_test_df.message:
    wordset_test = wordset_test.union(word for word in i.split(" "))
    wordlist_test += i.split(" ")
len(wordset_test)

434

In [288]:
def replace_words(text, word_dict):
    words = text.split()
    result = []
    
    # Iterate through each word in the text
    for word in words:
        replaced = False
        # Check each key-value pair in the dictionary
        for key in word_dict.keys():
            if Levenshtein.hamming(word, key) <= 1 and len(word) == len(key):
                result.append(key)  # Replace word with the dictionary key
                replaced = True
                break
        if not replaced:
            result.append(word)  # If no replacement found, keep the original word
    
    # Join the result back into a string
    return ' '.join(result)

In [289]:
for i in range(len(x_test_df.message)):
    x_test_df.loc[i,"message"] = replace_words(x_test_df.message[i],grouped_ordered_words)

In [290]:
wordset_test = set()
wordlist_test = []
for i in x_test_df.message:
    wordset_test = wordset_test.union(word for word in i.split(" "))
    wordlist_test += i.split(" ")
len(wordset_test)


129

In [291]:
encoder = OneHotEncoder(sparse_output=False)  # Set sparse_output=False to return a dense array

encoded_tail_test = encoder.fit_transform(x_test_df[['tail']])

encoded_tail_df_test = pd.DataFrame(encoded_tail_test, columns=encoder.get_feature_names_out(['tail']))

# Concatenate the original DataFrame with the one-hot encoded DataFrame
df_test = pd.concat([x_test_df, encoded_tail_df_test], axis=1).drop('tail', axis=1)  # Optionally drop original 'tail' column
df_test.drop(["tail_no"],axis=1,inplace=True)

In [292]:
df_test.head()

Unnamed: 0,message,fingers,tail_yes
0,zephyr terram nimbus terram faunar foliar,2,0.0
1,joyzor gleex luvium calmox shockus blissam,4,1.0
2,aquos arbor ventus,4,1.0
3,nympha nympha epikoz nympha mythox mythox mythox,3,0.0
4,deitax sirenix fabulon,4,1.0


In [293]:
df_test['Message_Tokens'] = df_test['message'].apply(lambda x: x.split())
df_test.head()

Unnamed: 0,message,fingers,tail_yes,Message_Tokens
0,zephyr terram nimbus terram faunar foliar,2,0.0,"[zephyr, terram, nimbus, terram, faunar, foliar]"
1,joyzor gleex luvium calmox shockus blissam,4,1.0,"[joyzor, gleex, luvium, calmox, shockus, blissam]"
2,aquos arbor ventus,4,1.0,"[aquos, arbor, ventus]"
3,nympha nympha epikoz nympha mythox mythox mythox,3,0.0,"[nympha, nympha, epikoz, nympha, mythox, mytho..."
4,deitax sirenix fabulon,4,1.0,"[deitax, sirenix, fabulon]"


In [296]:
X_text_test = np.array(df_test['Message_Tokens'].apply(lambda tokens: sentence_vector(tokens, w2v_model)).tolist())


In [297]:
X_text_test.shape

(299, 20)

In [298]:
X_fingers_test = df_test['fingers'].values.reshape(-1, 1)
X_tail_test = df_test['tail_yes'].values.reshape(-1, 1)

# Combine all features (Text, Number of Fingers, Tail)
X_test = np.hstack([X_text_test, X_fingers_test, X_tail_test])


In [239]:
y_pred_test = model.predict(X_test)

In [241]:
y_pred_test_species = label_encoder.inverse_transform(y_pred_test)

In [244]:
df_predictions = pd.DataFrame(y_pred_test_species, columns=['species'])
df_predictions

Unnamed: 0,species
0,Aquari
1,Sentire
2,Florian
3,Faerix
4,Mythron
...,...
294,Mythron
295,Nexoon
296,Mythron
297,Quixnar


In [245]:

# Save the DataFrame to a CSV file
df_predictions.to_csv('svm_predictions.csv', index=False)

print("Predictions saved to 'svm_predictions.csv'.")

Predictions saved to 'svm_predictions.csv'.
