In [28]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [29]:
# Download stopwords and initialize stemmer
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viketan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Preprocess the data and split into training and testing sets
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])
    
    # Perform stemming
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    return text

In [31]:
data = pd.read_csv('train.csv')

In [32]:
# Remove rows with missing values in 'question1' or 'question2' columns
data = data.dropna(subset=['question1', 'question2'])

In [33]:
# Preprocess the data and split into training and testing sets
questions_train = data.apply(lambda row: preprocess_text(row['question1'] + ' ' + row['question2']), axis=1)
labels_train = data['is_duplicate']

In [11]:
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit vectorizer on training data and transform both training and testing data
X_train = vectorizer.fit_transform(questions_train)

In [12]:
# Initialize logistic regression classifier
classifier = LogisticRegression()

# Train the classifier
classifier.fit(X_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Example testing data
questions_test = [
    "What is the step by step guide to invest in shares in India?",
    "How can I increase the speed of my internet connection?",
    # ...
]

In [14]:
# Transform the testing data using the fitted vectorizer
X_test = vectorizer.transform(questions_test)

# Predict similarity on the testing set
predictions = classifier.predict(X_test)

# Example: Print the predicted similarity for each test question
for question, prediction in zip(questions_test, predictions):
    similarity = "Similar" if prediction == 1 else "Not Similar"
    print(f"Question: {question}\nSimilarity: {similarity}\n")

Question: What is the step by step guide to invest in shares in India?
Similarity: Not Similar

Question: How can I increase the speed of my internet connection?
Similarity: Similar

