In [1]:
import numpy as np
import pandas as pd

In [2]:
# Relative locations of the train and test CSV files.
train_path = '../input/nlp-getting-started/train.csv'
test_path = '../input/nlp-getting-started/test.csv'

# Read both files into DataFrames (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [3]:
# Preview the first rows of the training DataFrame.
train_df.head()

In [4]:
# Preview the first rows of the test DataFrame.
test_df.head()

In [5]:
# 'text' values of every training row whose 'target' equals 0, selected in one .loc step.
train_df.loc[train_df['target'] == 0, 'text']

In [6]:
# Type of the object returned by the target == 0 text selection.
type(train_df.loc[train_df['target'] == 0, 'text'])

In [7]:
# Keep only the 'text' column of each split.
train_data, test_data = train_df['text'], test_df['text']

# Head of the train texts followed by the tail of the test texts, one per line block.
print(train_data.head(), test_data.tail(), sep='\n')

In [8]:
# Sizes of the two splits before they are combined.
print(len(train_data))
print(len(test_data))

# Series.append was removed in pandas 2.0; pd.concat stacks the two Series in the
# same order and, like append, keeps each split's original index labels.
data = pd.concat([train_data, test_data])
print(len(data))

In [9]:
# Show the texts as a NumPy array. np.asarray accepts both a Series and an ndarray,
# so this cell still runs after `data` has been rebound to an array further down.
np.asarray(data)

In [10]:
# Convert the combined texts to a NumPy array. np.asarray yields the same array as
# `.values` for a Series but also accepts an ndarray, so re-running this cell after
# `data` has already been converted no longer raises AttributeError.
data = np.asarray(data)
data.shape

In [11]:
# Inspect the last element of `data`.
data[-1]


In [12]:
import re
import string

punctuation = string.punctuation
# Replace every ASCII punctuation character with a space. The punctuation string is
# escaped first: unescaped, its "\" merely escapes the "]" that follows it inside the
# character class, so backslashes in the text would never be replaced.
temp = re.sub(r'[%s]' % re.escape(punctuation), ' ', data[-1])
temp

In [13]:
# Collapse each run of consecutive spaces into a single space.
temp = re.sub(r'[ ]+', r' ', temp)
temp

In [14]:
# Preview the lower-cased text; the result is only displayed, `temp` itself is not changed.
temp.lower()

In [15]:
# Split on single spaces into a list of tokens. `temp` is rebound from a str to a list
# here, so re-running this cell on its own fails (re.split needs a string); re-run the
# cells that rebuild `temp` first.
temp = re.split(r' ', temp)
temp

In [16]:
# Check which type `temp` currently has.
type(temp)

In [17]:
class TfidfVectorizer(object):
    """Small dense TF-IDF vectorizer with a cosine-similarity sentence lookup.

    * Term frequency: a word's count divided by the number of in-vocabulary
      words in the document.
    * Inverse document frequency: ``log(N / (1 + df))``, where ``N`` is the
      number of fitted documents and ``df`` the number of documents containing
      the word. It is zero or negative for words found in at least ``N - 1``
      documents.

    All matrices are dense NumPy arrays of shape (documents, vocabulary).

    Attributes set by ``fit_transform``:
        idf: array of shape (1, vocabulary) with the IDF weights.
        feature_names: sorted list of vocabulary words (column order).
        text_vectors: TF-IDF matrix of the fitted documents.
        text_data: the documents exactly as passed in. It is indexed by
            position in ``get_similar_sentences``, so pass a list or a NumPy
            array rather than a label-indexed container.
    """

    # One character class matching every ASCII punctuation mark. re.escape keeps
    # "\", "]", "^" and "-" literal inside the brackets.
    _PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation))

    def __init__(self):
        self.idf = None
        self.feature_names = None
        self.text_vectors = None
        self.text_data = None

    def _check_fitted(self):
        """Raise ``AttributeError`` when ``fit_transform`` has not been called yet."""
        if self.feature_names is None or self.idf is None:
            raise AttributeError(
                'TfidfVectorizer is not fitted yet; call fit_transform() first.')

    def _regularizer(self, sentence):
        """Turn one sentence into a list of lower-cased tokens.

        Punctuation is replaced by spaces, the text is lower-cased and split on
        any whitespace (spaces, tabs, newlines); tokens shorter than two
        characters are dropped.

        Args:
            sentence: the text to tokenize (str).

        Returns:
            list of str tokens; may be empty.
        """
        cleaned = self._PUNCTUATION_RE.sub(' ', sentence).lower()
        return [token for token in cleaned.split() if len(token) >= 2]

    def _vectorizer(self, text_arr):
        """Build the TF-IDF matrix for tokenized documents.

        Args:
            text_arr: a list of token lists (one per document), or a single
                flat token list, which is treated as one document.

        Returns:
            float array of shape (documents, vocabulary). Documents without any
            in-vocabulary word produce an all-zero row.

        Side effects:
            When ``self.idf`` is ``None`` the IDF weights are computed from
            ``text_arr`` and stored on the instance.
        """
        # A flat token list is one document; the emptiness guard avoids
        # indexing into an empty list.
        if len(text_arr) > 0 and not isinstance(text_arr[0], list):
            text_arr = [text_arr]

        # Dict lookup instead of list.index: constant time per token.
        word_to_col = {word: col for col, word in enumerate(self.feature_names)}
        matrix = np.zeros((len(text_arr), len(self.feature_names)))
        for row, tokens in enumerate(text_arr):
            for token in tokens:
                col = word_to_col.get(token)
                if col is not None:  # out-of-vocabulary words are ignored
                    matrix[row, col] += 1

        if self.idf is None:
            # Document frequency: in how many documents each word occurs.
            times_per_word = np.sum(matrix > 0, axis=0, keepdims=True)
            self.idf = np.log(len(text_arr) / (1.0 + times_per_word))

        word_num_per_document = np.sum(matrix, axis=1, keepdims=True)
        # Rows without known words would be 0/0 = NaN; dividing by 1 keeps them zero.
        word_num_per_document[word_num_per_document == 0] = 1.0

        # Scale in place (counts -> tf -> tf * idf) to avoid extra full-size copies.
        matrix /= word_num_per_document
        matrix *= self.idf
        return matrix

    def _achieve(self, data):
        """Fit the vocabulary and IDF on ``data`` and return its TF-IDF matrix.

        Args:
            data: iterable of str documents.

        Returns:
            float array of shape (documents, vocabulary), also stored in
            ``self.text_vectors``.
        """
        text_arrs = [self._regularizer(text) for text in data]
        vocabulary = set()
        for tokens in text_arrs:
            vocabulary.update(tokens)
        # Sorted so the column order is the same on every run.
        self.feature_names = sorted(vocabulary)
        # Drop IDF weights from any earlier fit; they belong to another vocabulary.
        self.idf = None
        self.text_vectors = self._vectorizer(text_arrs)
        return self.text_vectors

    def get_feature_names(self):
        """Return the vocabulary list; position ``i`` names matrix column ``i``.

        Raises:
            AttributeError: if the vectorizer has not been fitted.
        """
        self._check_fitted()
        return self.feature_names

    def fit_transform(self, data):
        """Fit on ``data`` and return its TF-IDF matrix.

        Args:
            data: sequence of str documents; kept in ``self.text_data`` for
                ``get_similar_sentences``.

        Returns:
            float array of shape (len(data), vocabulary).
        """
        self.text_data = data
        return self._achieve(data)

    def transform(self, data):
        """Vectorize a single sentence with the fitted vocabulary and IDF.

        Args:
            data: one sentence (str).

        Returns:
            float array of shape (1, vocabulary); all zeros when the sentence
            contains no in-vocabulary word.

        Raises:
            AttributeError: if the vectorizer has not been fitted.
        """
        self._check_fitted()
        tokens = self._regularizer(data)
        # Wrap explicitly so an empty token list is still exactly one document.
        return self._vectorizer([tokens])

    def calculate_cosine_similarity(self, targ_vector, base_vectors):
        """Rank the rows of ``base_vectors`` by cosine similarity to ``targ_vector``.

        Args:
            targ_vector: query vector that broadcasts against the rows of
                ``base_vectors`` (e.g. shape (1, vocabulary)).
            base_vectors: array of shape (documents, vocabulary).

        Returns:
            list of ``(row_index, similarity)`` tuples sorted from most to least
            similar. A pair involving a zero vector gets similarity 0.0.
        """
        numerator = np.sum(base_vectors * targ_vector, axis=1)
        denominator = (np.sqrt(np.sum(np.square(base_vectors), axis=1))
                       * np.sqrt(np.sum(np.square(targ_vector))))
        # Divide only where the norm product is non-zero; NaN values would
        # otherwise make the sort order unpredictable.
        cosine_similarity = np.zeros_like(numerator)
        has_norm = denominator > 0
        cosine_similarity[has_norm] = numerator[has_norm] / denominator[has_norm]
        return sorted(enumerate(cosine_similarity), key=lambda pair: pair[1], reverse=True)

    def get_similar_sentences(self, k, input_text):
        """Return the ``k`` fitted sentences most similar to ``input_text``.

        Args:
            k: number of sentences to return.
            input_text: query sentence (str).

        Returns:
            list of ``[sentence, similarity]`` pairs ordered from most to least
            similar.

        Raises:
            AttributeError: if the vectorizer has not been fitted.
        """
        input_vector = self.transform(input_text)
        # (index, similarity) pairs, highest similarity first.
        index_similarity_list = self.calculate_cosine_similarity(input_vector, self.text_vectors)
        return [[self.text_data[idx], similarity]
                for idx, similarity in index_similarity_list[0:k]]

In [18]:
# Fit the vectorizer on `data`; X is the dense TF-IDF matrix it returns, with one row
# per text and one column per vocabulary word.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

In [19]:
# Display the matrix returned by fit_transform.
X

In [20]:
# Dimensions of X.
X.shape

In [21]:
# Sum over every entry of X.
np.sum(X)

In [22]:
# First ten entries of the fitted feature-name list.
vectorizer.get_feature_names()[0:10]

In [23]:
# Vectorize one example sentence, print the vector together with the sum of its
# entries, then show the vector's shape.
temp = vectorizer.transform('This is an example')
print(temp, temp.sum())
temp.shape

In [24]:
# Look up the 10 fitted sentences that are most similar to the example query.
temp = 'This is an example'
vectorizer.get_similar_sentences(k=10, input_text=temp)