In [2]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split



  from .autonotebook import tqdm as notebook_tqdm


In [51]:
data=pd.read_csv('assignment_A.csv')

In [52]:
#distribution of classes
data["label"].value_counts()

0    2952
1    1048
Name: label, dtype: int64

In [53]:
data=data.dropna()

In [54]:
#pre-process sentences
def pre_process(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

data['proc_sent1'] = data['sentence1'].map(pre_process)
data['proc_sent2'] = data['sentence2'].map(pre_process)

In [55]:
#Embed text into features
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings_sentence1 = model.encode(data['proc_sent1'])
embeddings_sentence2 = model.encode(data['proc_sent2'])

data['embeddings_sentence1'] = list(embeddings_sentence1)
data['embeddings_sentence2'] = list(embeddings_sentence2)
data['combined_embeddings'] = data['embeddings_sentence1'] + data['embeddings_sentence2']

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import numpy as np
import textdistance


def calculate_similarity(row):
    
    cosine_sim=cosine_similarity([row['embeddings_sentence1']], [row['embeddings_sentence2']])[0][0]
    # Jaccard Similarity
    words1 = set(word_tokenize(row['proc_sent1']))
    words2 = set(word_tokenize(row['proc_sent2']))

    intersection = words1.intersection(words2)
    union = words1.union(words2)
    jaccard_sim = float(len(intersection)) / (len(union)+0.00000001)

    # Levenshtein Distance
    levenshtein_dist = textdistance.levenshtein(row['proc_sent1'], row['proc_sent2'])
    return cosine_sim, jaccard_sim, levenshtein_dist

In [58]:
val = data.apply(lambda row: calculate_similarity(row), axis=1)
list_values = [list(t) for t in val]
column_names = ['cosine_sim', 'jaccard_sim', 'levenshtein_dist']
new_df = pd.DataFrame(list_values, columns=column_names) 

In [71]:
data=pd.concat([data,new_df],axis=1)

In [97]:
data['combined_features'] = data.apply(
    lambda row: np.hstack((row['combined_embeddings'], row['cosine_sim'],row['jaccard_sim'],row['levenshtein_dist'])), axis=1)


In [99]:
data

Unnamed: 0,sentence1,sentence2,label,proc_sent1,proc_sent2,embeddings_sentence1,embeddings_sentence2,combined_embeddings,cosine_sim,jaccard_sim,levenshtein_dist,combined_features
0,Male issues and I need a Male to talk to,I need a male to talk too,1,male issue need male talk,need male talk,"[-0.03350903, 0.07108113, 1.3455219, 0.5637248...","[0.018803466, -0.012148427, 1.9551451, 0.56492...","[-0.014705565, 0.058932707, 3.300667, 1.128645...",0.962122,0.750000,11,"[-0.014705564826726913, 0.05893270671367645, 3..."
1,the past,"i have bad feet, and i have to take jobs like ...",0,past,bad foot take job like door dash lyft grub hub...,"[0.0040588663, -0.12999211, 2.2136116, 0.22531...","[0.10122915, 1.0644948, 0.59909374, 0.1839185,...","[0.10528802, 0.9345027, 2.8127053, 0.40923062,...",0.214481,0.000000,63,"[0.10528802126646042, 0.9345027208328247, 2.81..."
2,I feel like my gf is mad at me,I feel like I'm being deceived in some way by ...,1,feel like gf mad,feel like deceived way someone began dating ma...,"[0.48517704, -0.12296127, 2.020929, 0.2776249,...","[0.04451518, 0.4803249, 0.963113, 0.18849412, ...","[0.52969223, 0.35736364, 2.9840422, 0.46611902...",0.526352,0.111111,93,"[0.5296922326087952, 0.35736364126205444, 2.98..."
3,feeling,i feel like nothing is real anymore,0,feeling,feel like nothing real anymore,"[-0.09597687, -0.9879196, 2.6060772, 0.846855,...","[0.5703408, 0.2704484, 2.1174276, 0.04640493, ...","[0.47436395, -0.71747124, 4.723505, 0.89325994...",0.527320,0.000000,23,"[0.47436395287513733, -0.7174712419509888, 4.7..."
4,"I love my boyfriend, soon to be husband. I jus...",I constantly think my kids would be better off...,0,love boyfriend soon husband think happy anynore,constantly think kid would better without,"[-0.3272178, 0.46564952, 2.4937224, 0.43840873...","[-0.052636694, 0.41056067, 1.3571085, 0.071444...","[-0.37985447, 0.8762102, 3.850831, 0.50985295,...",0.411485,0.083333,40,"[-0.3798544704914093, 0.8762102127075195, 3.85..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,relationship confusion,okboomer,0,relationship confusion,okboomer,"[-0.012656743, 0.34704006, 1.7000375, 0.046574...","[-0.20120949, 0.5136952, 1.7885517, 0.16150445...","[-0.21386623, 0.86073524, 3.4885893, 0.2080789...",0.397363,0.000000,20,"[-0.2138662338256836, 0.8607352375984192, 3.48..."
3996,my life makes me feelhopeful,my relationship makes me feel __depressed__,0,life make feelhopeful,relationship make feel __depressed__,"[0.078545794, -0.2169184, 2.392569, 0.6493329,...","[0.11857631, 0.18302812, 1.4769051, 0.11034703...","[0.1971221, -0.033890277, 3.8694742, 0.7596799...",0.502150,0.166667,22,"[0.19712209701538086, -0.03389027714729309, 3...."
3997,family,bored from quarantine,0,family,bored quarantine,"[-0.21151088, -0.28410962, 2.0763443, -0.02894...","[0.1866005, 0.13393791, 1.6164118, 0.70627993,...","[-0.024910375, -0.15017171, 3.6927562, 0.67733...",0.361153,0.000000,14,"[-0.024910375475883484, -0.15017171204090118, ..."
3998,fighting with husband both drunk unresolved pa...,im not secure about my self because im over we...,0,fighting husband drunk unresolved past issue d...,im secure self im weight,"[-0.33909604, 1.118972, 0.94716614, 0.09483159...","[0.21722113, -0.19138905, 1.3190721, 0.2770528...","[-0.12187491, 0.92758286, 2.2662382, 0.3718843...",-0.108543,0.000000,162,"[-0.12187491357326508, 0.927582859992981, 2.26..."


In [134]:
,#Split and Train the sentences
X = data[['cosine_sim','jaccard_sim']]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)

In [135]:
X_train_arr = np.vstack(X_train.values)
X_test_arr = np.vstack(X_test.values)

In [138]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_arr)
X_test_scaled = scaler.transform(X_test_arr)
# Create an SVM model with a specified kernel (e.g., 'linear', 'poly', 'rbf', or 'sigmoid')
clf = SVC(kernel='rbf',degree= 2)

# Fit the model to the training data
clf.fit(X_train_scaled, y_train)

In [139]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89       590
           1       0.82      0.40      0.54       210

    accuracy                           0.82       800
   macro avg       0.82      0.68      0.71       800
weighted avg       0.82      0.82      0.80       800



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.logspace(-4, 4, 10),
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'] + list(np.logspace(-4, 4, 5)),
    'coef0': list(np.linspace(-1, 1, 5)),
}

# param_grid = {
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [2, 3, 4],
# }


# Create the SVM model
svm = SVC()

# Perform grid search with cross-validation
grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=2, verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Show the best parameters found
print("Best Parameters:")
print(grid_search.best_params_)

# Evaluate the model with the best hyperparameters on the test data
y_pred = grid_search.best_estimator_.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Fitting 2 folds for each of 4200 candidates, totalling 8400 fits
