### Import required libraries

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

### Import data

In [2]:
data = pd.read_csv('Text_Similarity_Dataset.csv')
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


### Text pre-processing

* Function to remove punctuation, stopwords and lemmatization

In [3]:
def remove_punctuation(text):
  return [word for word in text if word.isalpha()]

def remove_punctuation_from_word(text):
  token = []
  for word in text:
    if word[-1].isalpha():
      token.append(word)
    else:
      token.append(word[:-1]) 
  return token

stop_words = stopwords.words('english')
def remove_stopword(text):
  return [w for w in text if not w in stop_words]
    
def lemmatizing(text):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in text]

* Convert to lowercase -> tokenization -> removing punctuation -> removing stop words -> Lemmatization

In [4]:
def preprocessText(raw_text):
    processed_text = data[raw_text]

    print('Converting to lower case...')
    processed_text = [text.strip().lower() for text in processed_text]
    print('Done')

    print('Tokenizing...')
    processed_text = [word_tokenize(text) for text in processed_text]
    print('Done')

    print('Removing punctuation...')
    processed_text = [remove_punctuation(text) for text in processed_text]
    processed_text = [remove_punctuation_from_word(text) for text in processed_text]
    print('Done')

    print('Removing Stop words...')
    processed_text = [remove_stopword(text) for text in processed_text]
    print('Done')

    print('Lemmatizing...')
    processed_text = [lemmatizing(text) for text in processed_text]
    
    processed_text = [' '.join(text) for text in processed_text]
    
    print('Text pre-processing Done, ', raw_text, '\n')
    return processed_text

In [5]:
data['preprocessedText1'] = preprocessText('text1')
data['preprocessedText2'] = preprocessText('text2')

Converting to lower case...
Done
Tokenizing...
Done
Removing punctuation...
Done
Removing Stop words...
Done
Lemmatizing...
Text pre-processing Done,  text1 

Converting to lower case...
Done
Tokenizing...
Done
Removing punctuation...
Done
Removing Stop words...
Done
Lemmatizing...
Text pre-processing Done,  text2 



In [6]:
data.head()

Unnamed: 0,Unique_ID,text1,text2,preprocessedText1,preprocessedText2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...,savvy searcher fail spot ad internet search en...,newcastle bolton kieron dyer smashed home winn...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...,million miss net uk population still without i...,nasdaq planning share sale owner nasdaq stock ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...,young debut cut short ginepri donald young fir...,ruddock back yapp credential wale coach mike r...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...,diageo buy u wine firm diageo world biggest sp...,mci share climb takeover bid share u phone com...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...,careful code new european directive could put ...,medium gadget get moving device let people car...


### Modelling

In [9]:
# Tf-idf
vectorizer = TfidfVectorizer(max_df = 0.7, stop_words='english')

In [10]:
# SVD
svd_model = TruncatedSVD(n_components=500, random_state=42)

In [11]:
# Building the pipeline
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

preprocessing_model = svd_transformer.fit(data['preprocessedText1'])
svd_matrix_text1 = preprocessing_model.transform(data['preprocessedText1'])

In [12]:
svd_matrix_text1

array([[ 0.13038022, -0.05806473,  0.10923112, ...,  0.00219219,
         0.02861354, -0.00092971],
       [ 0.21136546, -0.04948473,  0.14078525, ..., -0.00179285,
        -0.00678844, -0.02985562],
       [ 0.11871561, -0.09289399, -0.09025369, ..., -0.03254321,
         0.01996912, -0.01782526],
       ...,
       [ 0.20019237, -0.11674107,  0.17087802, ..., -0.0148589 ,
         0.06638511,  0.0343965 ],
       [ 0.10248629, -0.08296658, -0.12404679, ...,  0.00418456,
        -0.0093032 , -0.00147784],
       [ 0.13477473,  0.04053834,  0.02371042, ..., -0.00181597,
         0.04251643, -0.01510969]])

In [13]:
svd_matrix_text2 = preprocessing_model.transform(data['preprocessedText2'])

In [14]:
svd_matrix_text2

array([[ 0.0952887 , -0.06371367, -0.06538803, ..., -0.01451218,
         0.03646627, -0.02733624],
       [ 0.11029573, -0.0364291 ,  0.1113235 , ..., -0.01916614,
        -0.00723677, -0.00316568],
       [ 0.13228073, -0.08848359, -0.11273877, ..., -0.01340538,
        -0.01441074,  0.00462056],
       ...,
       [ 0.15917015, -0.02948292, -0.05557423, ..., -0.00390803,
        -0.01013385, -0.02614684],
       [ 0.11200663, -0.01919764,  0.07753895, ...,  0.00991106,
        -0.01260567,  0.00884597],
       [ 0.10538562, -0.04802579, -0.0397895 , ...,  0.01014282,
        -0.00649919,  0.01959902]])

In [15]:
# Calculating cosine similarity
distance_matrix = cosine_similarity(svd_matrix_text1, svd_matrix_text2)

In [16]:
distance_matrix = pd.DataFrame(distance_matrix)
x = distance_matrix.values
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
x_scaled = min_max_scaler.fit_transform(x)
distance_matrix = pd.DataFrame(x_scaled)
distance_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4013,4014,4015,4016,4017,4018,4019,4020,4021,4022
0,0.094483,0.037588,0.046469,0.018,0.045913,0.073277,0.057745,0.057094,0.051648,0.054302,...,0.028868,0.056583,0.05541,0.053447,0.036963,0.047756,0.054306,0.045959,0.045832,0.042054
1,0.094294,0.081,0.074806,0.052279,0.081849,0.057937,0.056372,0.044149,0.081578,0.12694,...,0.052213,0.086241,0.084513,0.082841,0.045273,0.07669,0.090515,0.071822,0.052665,0.094181
2,0.107111,0.030294,0.097587,0.026388,0.068697,0.131733,0.088121,0.016347,0.070694,0.070338,...,0.336833,0.054534,0.087848,0.046271,0.060678,0.045976,0.046595,0.108936,0.062854,0.068017
3,0.075733,0.09513,0.054825,0.073007,0.051497,0.215482,0.044662,0.016908,0.047583,0.171874,...,0.033663,0.045393,0.068774,0.041549,0.052339,0.128138,0.050941,0.072003,0.081286,0.055311
4,0.085867,0.066192,0.068789,0.037187,0.093613,0.091134,0.087874,0.068217,0.065856,0.071569,...,0.026047,0.104515,0.063861,0.090001,0.056884,0.059392,0.089292,0.07311,0.060667,0.079991


In [17]:
result = pd.DataFrame(np.diag(distance_matrix), index=[distance_matrix.index, distance_matrix.columns])
result = result.reset_index()
result.drop('level_0', axis=1, inplace=True)
result.columns = ['Unique_ID', 'Similarity_Score']
result.head()

Unnamed: 0,Unique_ID,Similarity_Score
0,0,0.094483
1,1,0.081
2,2,0.097587
3,3,0.073007
4,4,0.093613


In [18]:
result.to_csv('scores.csv', index=False)

### NEXT: 
* To try Run the SVD Model independently and explore the model attributes : singular_values_, explained_variance_, explained_variance_ratio_, singular_values_
* Try different model other than cosine similarity, also deep learning model 