In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
from termcolor import colored
import numpy as np

In [2]:
# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per sentence.
_STOPWORD_CACHE = {}


def remove_stopwords(sentence):
    """Remove English stopwords from a space-delimited sentence.

    The sentence is split on single spaces, every token found in NLTK's
    English stopword list is dropped, and the remaining tokens are joined
    back together with single spaces. Tokens are compared exactly as they
    appear in the sentence (no case folding is done here).

    :param sentence: text to filter
    :return: the sentence with stopword tokens removed
    """
    english_stopwords = _STOPWORD_CACHE.get("english")
    if english_stopwords is None:
        # frozenset gives constant-time membership tests; the original list
        # was scanned linearly for every token.
        english_stopwords = frozenset(stopwords.words("english"))
        _STOPWORD_CACHE["english"] = english_stopwords
    return " ".join(
        token for token in sentence.split(" ") if token not in english_stopwords
    )


In [3]:
def stemming(sentence):
    """Apply the Porter stemmer to each space-separated token of a sentence.

    :param sentence: text whose tokens are separated by single spaces
    :return: the stemmed tokens joined back together with single spaces
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in sentence.split(" ")]
    return " ".join(stems)

In [4]:
def pre_process(data):
    """
    Clean and normalize the two title columns of a DataFrame.

    The same three steps are applied to ``title1_en`` and ``title2_en``:
      1. cleaning - lowercase, turn ``<br /><br />``, ``-`` and ``/`` into
         spaces, strip punctuation
      2. stopword removal (``remove_stopwords``)
      3. normalization - stemming (``stemming``)

    Progress messages are printed for each step.

    :param data: DataFrame with columns ``title1_en`` and ``title2_en`` whose
                 values are strings (``.lower()`` is called on each)
    :return: new DataFrame with columns ``title1`` and ``title2`` holding the
             processed text, in the same row order as the input
    """
    print(colored("1. Preprocessing Data", "yellow"))

    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing (the non-raw forms relied on invalid escapes such as "\(").
    replace_no_space = re.compile(r"""[_.;:!'?,"()\[\]<>]""")
    replace_with_space = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")

    def _clean(line):
        # The space substitutions must run before punctuation stripping:
        # stripping removes "<" and ">", after which the "<br /><br />"
        # alternative could never match.
        spaced = replace_with_space.sub(" ", line.lower())
        return replace_no_space.sub("", spaced)

    # make lower case and remove punctuation
    print(colored("\t1.1 Cleaning data...", "yellow"), end="", flush=True)
    title1 = [_clean(line) for line in data["title1_en"]]
    title2 = [_clean(line) for line in data["title2_en"]]
    print(colored(" [Done]", "green"))

    print(colored("\t1.2 Removing stopwords...", "yellow"), end="", flush=True)
    title1 = [remove_stopwords(line) for line in title1]
    title2 = [remove_stopwords(line) for line in title2]
    print(colored(" [Done]", "green"))

    print(colored("\t1.3 Stemming...", "yellow"), end="", flush=True)
    title1 = [stemming(line) for line in title1]
    title2 = [stemming(line) for line in title2]
    print(colored(" [Done]", "green"))

    return pd.DataFrame({'title1': title1, 'title2': title2})

In [5]:
# Load the raw title pairs, then rebind `data` to the cleaned frame returned
# by pre_process (columns title1 / title2).
data = pd.read_csv("test.csv", sep=",")
# NOTE(review): placeholder label assignment left disabled; pre_process does
# not return a label column.
# data['label'] = 'unrelated'
data = pre_process(data)

[33m1. Preprocessing Data[0m
[33m	1.1 Cleaning data...[0m[32m [Done][0m
[33m	1.2 Removing stopwords...[0m[32m [Done][0m
[33m	1.3 Stemming...[0m[32m [Done][0m


In [6]:
# Sanity check: (rows, columns) of the preprocessed frame.
data.shape

(64110, 2)

In [7]:
# Preview the first rows of the cleaned titles.
data.head()

Unnamed: 0,title1,title2
0,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milk huo jianhua seen ...
1,nasa reveal fact ufo wreckag found moon,ufo found yuancun jiaocheng counti shanxi shoc...
2,hollow tomato load hormon,li chenfan bingb home photo netizen call luxur...
3,ang pavilion geoshui accur matrimoni match mat...,master one eight charact presumpt marriag soon...
4,50 year old bu bu blow 8 year old child rumor ...,joe johnson disgruntl time order myth


In [8]:
# Titles that are exactly '' after preprocessing are marked as missing and
# those rows are dropped. The columns are rebuilt with assign() rather than
# mutated through data[col].replace(..., inplace=True): that chained in-place
# form acts on a temporary Series and does not update `data` under pandas
# Copy-on-Write.
data = data.assign(
    title1=data['title1'].replace('', np.nan),
    title2=data['title2'].replace('', np.nan),
).dropna()
# Confirm no missing values remain.
data.isnull().sum()

title1    0
title2    0
dtype: int64

In [9]:
# Dropping rows leaves gaps in the index; renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as an 'index'
# column that would then have to be removed again.
data = data.reset_index(drop=True)
data.columns

Index(['title1', 'title2'], dtype='object')

In [10]:
def common_words(str1, str2):
    """Count the distinct whitespace-separated tokens present in both strings.

    :param str1: first text
    :param str2: second text
    :return: number of unique tokens the two texts share
    """
    first_tokens = set(str1.split())
    shared = first_tokens.intersection(str2.split())
    return len(shared)

In [11]:
def get_jaccard_sim(str1, str2):
    """Jaccard similarity between the token sets of two strings.

    Each string is split on whitespace into a set of unique tokens; the
    result is ``|A & B| / |A | B|``.

    :param str1: first text
    :param str2: second text
    :return: float in [0.0, 1.0]; 0.0 when neither string contains a token
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    # Both inputs empty or whitespace-only: the ratio is undefined, and the
    # unguarded division raised ZeroDivisionError. Report no similarity.
    if not union_size:
        return 0.0
    return len(a & b) / union_size

In [12]:
import re
import math
from collections import Counter


def get_cosine(vec1, vec2):
    """Cosine similarity of two sparse vectors given as key -> weight mappings.

    :param vec1: mapping from key to numeric weight
    :param vec2: mapping from key to numeric weight
    :return: dot product divided by the product of the two Euclidean norms,
             or 0.0 when that product is zero
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())
    magnitude = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # A zero magnitude means at least one vector has no weight at all.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0


def text_to_vector(text):
    """Turn a text into a bag-of-words Counter.

    :param text: string to tokenize
    :return: Counter mapping each ``\\w+`` match in the text to its number
             of occurrences
    """
    return Counter(re.findall(r'\w+', text))


def get_cs(content_a, content_b):
    """Cosine similarity between the word-count vectors of two texts.

    :param content_a: first text
    :param content_b: second text
    :return: result of ``get_cosine`` on the ``text_to_vector`` counts of
             both texts
    """
    return get_cosine(text_to_vector(content_a), text_to_vector(content_b))


In [13]:
# Spot-check both similarity measures on the title pair at index label 9.
# `js` and `cs` hold single floats here; the next cell rebinds both names
# to lists.
js = get_jaccard_sim(data['title1'][9],data['title2'][9])

cs = get_cs(data['title1'][9],data['title2'][9])

In [14]:
# Per-row features for every title pair: Jaccard similarity, cosine
# similarity and the number of shared tokens.
js = []
cs = []
common_count = []

# Walk the two columns in lockstep. The previous data[col][i] lookups were
# label-based and only worked when the index was exactly 0..n-1.
for t1, t2 in zip(data['title1'], data['title2']):
    js.append(get_jaccard_sim(t1, t2))
    cs.append(get_cs(t1, t2))
    common_count.append(common_words(t1, t2))

In [15]:
# Assign the list directly so values attach by position. Wrapping it in a
# DataFrame made the assignment align on index labels, which silently yields
# NaN if the index is not 0..n-1.
data['js'] = js
data.head()

Unnamed: 0,title1,title2,js
0,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milk huo jianhua seen ...,0.0
1,nasa reveal fact ufo wreckag found moon,ufo found yuancun jiaocheng counti shanxi shoc...,0.117647
2,hollow tomato load hormon,li chenfan bingb home photo netizen call luxur...,0.0
3,ang pavilion geoshui accur matrimoni match mat...,master one eight charact presumpt marriag soon...,0.0
4,50 year old bu bu blow 8 year old child rumor ...,joe johnson disgruntl time order myth,0.0625


In [16]:
# Assign the list directly so values attach by position. Wrapping it in a
# DataFrame made the assignment align on index labels, which silently yields
# NaN if the index is not 0..n-1.
data['cs'] = cs
data.head()

Unnamed: 0,title1,title2,js,cs
0,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milk huo jianhua seen ...,0.0,0.0
1,nasa reveal fact ufo wreckag found moon,ufo found yuancun jiaocheng counti shanxi shoc...,0.117647,0.218218
2,hollow tomato load hormon,li chenfan bingb home photo netizen call luxur...,0.0,0.0
3,ang pavilion geoshui accur matrimoni match mat...,master one eight charact presumpt marriag soon...,0.0,0.0
4,50 year old bu bu blow 8 year old child rumor ...,joe johnson disgruntl time order myth,0.0625,0.091287


In [17]:
# Assign the list directly so values attach by position. Wrapping it in a
# DataFrame made the assignment align on index labels, which silently yields
# NaN if the index is not 0..n-1.
data['common_count'] = common_count
data.head()

Unnamed: 0,title1,title2,js,cs,common_count
0,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milk huo jianhua seen ...,0.0,0.0,0
1,nasa reveal fact ufo wreckag found moon,ufo found yuancun jiaocheng counti shanxi shoc...,0.117647,0.218218,2
2,hollow tomato load hormon,li chenfan bingb home photo netizen call luxur...,0.0,0.0,0
3,ang pavilion geoshui accur matrimoni match mat...,master one eight charact presumpt marriag soon...,0.0,0.0,0
4,50 year old bu bu blow 8 year old child rumor ...,joe johnson disgruntl time order myth,0.0625,0.091287,1


In [18]:
# Mean of the 'js' (Jaccard similarity) column over all rows.
data["js"].mean()

0.14854121375602256

In [19]:
# Mean of the 'cs' (cosine similarity) column over all rows.
data["cs"].mean()

0.24557028536636247

In [20]:
# Mean of the 'common_count' (shared-token count) column over all rows.
data['common_count'].mean()

2.260969940880091

In [21]:
# Write the frame (titles plus the feature columns added above) to disk;
# index=False keeps the row index out of the CSV.
data.to_csv("test_proc.csv",index=False)