In [2]:
import zipfile

# Unpack the zipped training data into the current directory.
path_to_zip_file = "/kaggle/input/quora-question-pairs/train.csv.zip"
directory_to_extract_to = "./"

with zipfile.ZipFile(path_to_zip_file, mode='r') as archive:
    archive.extractall(path=directory_to_extract_to)

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup

In [5]:
# Read the extracted training CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/working/train.csv")
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [25]:
# Symbols spelled out as words. The padding spaces keep the word from fusing
# with its neighbours (e.g. "$1000" -> " dollar 1000"), which also lets the
# number abbreviation below see the digits as a standalone token.
_SYMBOL_WORDS = {
    '%': ' percent ',
    '$': ' dollar ',
    '₹': ' rupee ',
    '€': ' euro ',
    '@': ' at ',
}

# Whole-token contraction expansions (keys are lowercase, straight apostrophe).
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Suffix fallbacks for contractions that are not whole-token matches in the
# table above (unlisted words, or tokens with punctuation attached such as
# "isn't,"). The leading space in each replacement keeps the words separate.
_CONTRACTION_SUFFIXES = (
    ("'ve", " have"),
    ("n't", " not"),
    ("'re", " are"),
    ("'ll", " will"),
)


def _abbreviate_number(match):
    """Turn a matched run of digits into a floor-divided k / m / b abbreviation."""
    num = int(match.group())
    if num >= 1_000_000_000:
        return f'{num//1_000_000_000}b'
    elif num >= 1_000_000:
        return f'{num//1_000_000}m'
    elif num >= 1_000:
        return f'{num//1_000}k'
    return str(num)


def _simplify_numbers(text):
    """Abbreviate every standalone number of four or more digits in ``text``."""
    return re.sub(r'\b\d{4,}\b', _abbreviate_number, text)


def preprocess(r):
    """Normalise one question into lowercase, space-separated word tokens.

    Steps, in order: lowercase and strip; spell out %, $, ₹, € and @ as words;
    drop ``[math]`` / ``[/math]`` markers; abbreviate long numbers (1000 -> 1k);
    expand English contractions; strip HTML markup; collapse every run of
    non-word characters into a single space.

    Parameters
    ----------
    r : object
        The raw question. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as a missing question.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing or blank question.
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if r is None or (isinstance(r, float) and r != r):
        return ''

    r = str(r).lower().strip()

    # Typographic apostrophes would bypass the contraction handling below.
    r = r.replace('’', "'").replace('‘', "'")

    for symbol, word in _SYMBOL_WORDS.items():
        r = r.replace(symbol, word)

    # Remove both the opening and closing marker; leaving "[/math]" behind
    # would end up as a stray "math" token after punctuation is stripped.
    r = r.replace('[math]', '').replace('[/math]', '')

    r = _simplify_numbers(r)

    r = ' '.join(_CONTRACTIONS.get(word, word) for word in r.split())
    for suffix, expansion in _CONTRACTION_SUFFIXES:
        r = r.replace(suffix, expansion)

    # Only invoke the HTML parser when the text can contain a tag or an
    # entity; for anything else the final regex already does all the cleanup.
    if '<' in r or '&' in r:
        r = BeautifulSoup(r, "html.parser").get_text()

    r = re.sub(r'\W+', ' ', r).strip()

    return r

In [28]:
# Clean both question columns in place with the same text normaliser.
for question_column in ('question1', 'question2'):
    df[question_column] = df[question_column].apply(preprocess)

In [32]:
def _count_space_separated(text):
    """Number of pieces obtained by splitting ``text`` on single spaces."""
    return len(text.split(" "))


question_columns = {'q1': 'question1', 'q2': 'question2'}

# Character lengths first, then word counts, so the new columns are appended
# in the order q1_len, q2_len, q1_num_words, q2_num_words.
for prefix, source_column in question_columns.items():
    df[f'{prefix}_len'] = df[source_column].str.len()
for prefix, source_column in question_columns.items():
    df[f'{prefix}_num_words'] = df[source_column].apply(_count_space_separated)

In [34]:
def common_words(row):
    """Count the distinct space-separated words shared by both questions.

    Each question is split on single spaces, and every piece is lowercased
    and stripped before the two vocabularies are intersected.
    """
    def _vocabulary(text):
        return {piece.lower().strip() for piece in text.split(" ")}

    shared = _vocabulary(row['question1']).intersection(_vocabulary(row['question2']))
    return len(shared)

# Row-wise count of the words the two questions have in common.
df['common_words'] = df.apply(common_words, axis='columns')

In [35]:
def total_words(row):
    """Sum of the distinct-word counts of the two questions.

    Each question is split on single spaces; pieces are lowercased and
    stripped, de-duplicated per question, and the two set sizes are added.
    """
    vocabulary_sizes = [
        len({piece.lower().strip() for piece in row[column].split(" ")})
        for column in ('question1', 'question2')
    ]
    return vocabulary_sizes[0] + vocabulary_sizes[1]

# Row-wise sum of each question's distinct-word count.
df['total_words'] = df.apply(total_words, axis='columns')

In [38]:
# Fraction of the combined vocabulary that is shared, rounded to two decimals.
shared_fraction = df['common_words'] / df['total_words']
df['word_share'] = shared_fraction.round(2)

In [42]:
from nltk.corpus import stopwords

# Lazily-filled cache so the NLTK stop-word corpus is read once, not per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a cached frozenset."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def fetch_token_features(row, stop_words=None):
    """Compute eight token-overlap features for one question pair.

    Parameters
    ----------
    row : mapping
        Must provide the strings ``row['question1']`` and ``row['question2']``.
    stop_words : iterable of str, optional
        Words to treat as stop words. When omitted, the NLTK English
        stop-word list is used (loaded once and cached).

    Returns
    -------
    list
        ``[cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
        last_word_eq, first_word_eq]`` where

        * cwc_* = shared non-stop words / min or max non-stop-word count,
        * csc_* = shared stop words / min or max stop-word count,
        * ctc_* = shared distinct tokens / min or max token count,
        * last_word_eq / first_word_eq = 1 if the last / first tokens match.

        All eight values are ``0.0`` when either question has no tokens.
    """
    q1 = row['question1']
    q2 = row['question2']

    # Small constant added to every denominator to avoid division by zero.
    con = 0.00001

    token_features = [0.0]*8
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Membership tests below run once per token; use a hashed container.
        stop_words = frozenset(stop_words)

    q1_words = set([word for word in q1_tokens if word not in stop_words])
    q2_words = set([word for word in q2_tokens if word not in stop_words])

    q1_stops = set([word for word in q1_tokens if word in stop_words])
    q2_stops = set([word for word in q2_tokens if word in stop_words])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    # Each ratio is normalised by the size of the population it was counted
    # from: non-stop words, stop words, and all tokens respectively.
    token_features[0] = common_word_count / (min(len(q1_words),len(q2_words)) + con)
    token_features[1] = common_word_count / (max(len(q1_words),len(q2_words)) + con)
    token_features[2] = common_stop_count / (min(len(q1_stops),len(q2_stops)) + con)
    token_features[3] = common_stop_count / (max(len(q1_stops),len(q2_stops)) + con)
    token_features[4] = common_token_count / (min(len(q1_tokens),len(q2_tokens)) + con)
    token_features[5] = common_token_count / (max(len(q1_tokens),len(q2_tokens)) + con)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features

In [43]:
token_features = df.apply(fetch_token_features,axis=1)

# One column per position of the list returned by fetch_token_features.
# Every name is distinct so the *_max ratios no longer overwrite *_min.
token_feature_names = [
    'cwc_min',
    'cwc_max',
    'csc_min',
    'csc_max',
    'ctc_min',
    'ctc_max',
    'last_word_eq',
    'first_word_eq',
]
for position, feature_name in enumerate(token_feature_names):
    df[feature_name] = [features[position] for features in token_features]

In [45]:
!pip install distance

Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16256 sha256=4efef8b8830a334fe26cb269254bc5615822e651e3db28043500ba95217495f6
  Stored in directory: /root/.cache/pip/wheels/fb/cd/9c/3ab5d666e3bcacc58900b10959edd3816cc9557c7337986322
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3


In [60]:
import distance

def fetch_length_features(row):
    """Compute three length-based features for one question pair.

    Returns ``[abs_len_diff, mean_len, longest_substr_ratio]``:

    * absolute difference between the two whitespace-token counts,
    * mean of the two token counts,
    * length of a longest common substring divided by
      ``min(len(question1), len(question2)) + 1`` (character lengths).

    All three are ``0.0`` when either question has no tokens.
    """
    first_text, second_text = row['question1'], row['question2']
    first_count = len(first_text.split())
    second_count = len(second_text.split())

    # Nothing to compare if either side is empty or whitespace only.
    if not (first_count and second_count):
        return [0.0] * 3

    shared_substrings = list(distance.lcsubstrings(first_text, second_text))
    if shared_substrings:
        shorter_length = min(len(first_text), len(second_text))
        substring_ratio = len(shared_substrings[0]) / (shorter_length + 1)
    else:
        substring_ratio = 0.0

    return [
        abs(first_count - second_count),
        (first_count + second_count) / 2,
        substring_ratio,
    ]


In [61]:
length_features = df.apply(fetch_length_features, axis='columns')

# One column per position of the list returned by fetch_length_features.
length_feature_names = ('abs_len_diff', 'mean_len', 'longest_subtr_ratio')
for position, feature_name in enumerate(length_feature_names):
    df[feature_name] = [features[position] for features in length_features]

In [62]:
# Write the engineered feature table to disk without the index column.
df.to_csv(path_or_buf='preprocessed1.csv', index=False)