In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('dataset/train.csv')
df.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,4,Powermax Rubber Factory,Co. One,0
4,5,Tress A/S,Longyou Industries Park Zhejiang,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df[['name_1', 'name_2']],df['is_duplicate'], 
    test_size=0.25, random_state=42)

In [4]:
X_train.shape

(373364, 2)

In [5]:
text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)

In [6]:
X_train_1 = text_transformer.fit_transform(X_train['name_1'])

In [7]:
X_train_2 = text_transformer.transform(X_train['name_2'])

In [8]:
X_train_1.shape

(373364, 43708)

In [9]:
X_train_2.shape

(373364, 43708)

In [10]:
from scipy.sparse import hstack

X_train_concat = hstack([X_train_1,X_train_2])

In [11]:
X_train_concat.shape

(373364, 87416)

In [12]:
logreg = LogisticRegression(C=5e1, solver='lbfgs', random_state=42, n_jobs=4)

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [14]:
cv_results = cross_val_score(logreg, X_train_concat, y_train.values, cv=skf, scoring='f1')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
cv_results

array([0.84765625, 0.85330777, 0.82912621, 0.82792527, 0.8460039 ])

In [16]:
cv_results.mean()

0.8408038797380601

In [17]:
X_test_1 = text_transformer.transform(X_test['name_1'])
X_test_2 = text_transformer.transform(X_test['name_2'])
X_test_concat = hstack([X_test_1,X_test_2])

In [18]:
logreg.fit(X_train_concat, y_train.values)

In [19]:
preds = logreg.predict(X_test_concat)

In [20]:
from sklearn.metrics import f1_score

f1_score(y_test.values, preds)

0.8521424260712132

In [25]:
import numpy as np

np.unique(preds, return_counts=True)

(array([0, 1]), array([123685,    770]))

In [32]:
from sklearn.svm import OneClassSVM

oneclass_svm = OneClassSVM(gamma='auto')
oneclass_svm.fit(X_train_concat)

In [None]:
preds = oneclass_svm.predict(X_test_concat)


In [None]:
f1_score(y_test.values, preds)