In [1]:
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Raw scraped reviews; the first CSV row is the header.
data = pd.read_csv("../data/game_data_all.csv", header=0)

# Plain Python lists of the two columns that the preprocessing below rewrites.
review = data["review"].values.tolist()
helpfuls = data["marked as helpful"].values.tolist()

# NLTK's English stop words plus "game" (presumably because it shows up in
# nearly every review and so carries no signal -- confirm with the author).
stop_words = stopwords.words('english') + ["game"]

wordnet_lemmatizer = WordNetLemmatizer()

# A token is two or more characters: it starts and ends with a word character
# and may contain apostrophes or hyphens in between (e.g. "don't", "co-op").
pattern = r'\w[\w\'-]*\w'

def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its leading letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# --- Normalise every review: lowercase, tokenize, drop stop words, lemmatize ---
init_reviews = []

# Set membership is O(1) per token; the module-level `stop_words` list is left
# untouched so other cells can keep using it.
stop_word_set = set(stop_words)

for doc in review:
    # pandas represents an empty CSV cell as NaN (a float), which has no
    # .lower(). Store such rows as empty reviews; the next cell drops them.
    if not isinstance(doc, str):
        init_reviews.append("")
        continue

    tokens = nltk.regexp_tokenize(doc.lower(), pattern)
    tagged_tokens = nltk.pos_tag(tokens)
    # The lemmatizer needs the WordNet POS of each word to pick the right lemma.
    lemmatized_words = [
        wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for (word, tag) in tagged_tokens
        if word not in stop_word_set and word not in string.punctuation
    ]
    init_reviews.append(" ".join(lemmatized_words))

# "No" in the helpful column becomes the string "0"; every other value is
# kept exactly as read from the CSV.
init_helpfuls = ["0" if line == "No" else line for line in helpfuls]

# Assemble the cleaned frame in one go, renaming the scraped columns.
df = pd.DataFrame({
    "user_name": data["names"].values.tolist(),
    "user_product": data["products#"].values.tolist(),
    "helpful": init_helpfuls,
    "funny": data["marked as funny"].values.tolist(),
    "post_date": data["post_date"].values.tolist(),
    "recommend_or_not": data["Recommend?"].values.tolist(),
    "game_time": data["times on record"].values.tolist(),
    "review": init_reviews,
})

# df.to_csv('../data/tokened_normed_review.csv',index=True)
print("1")

1


In [2]:
# Row labels of the reviews that ended up empty after normalisation.
indx = df.loc[df.review == ''].index.tolist()

# Remove those rows, then discard every user_name that occurs more than once
# (keep=False keeps none of the duplicated rows, not even the first one).
df1 = (
    df.drop(df.index[indx])
      .drop_duplicates(subset="user_name", keep=False)
)
# df1.to_csv('tokened_normed_review_v2.csv',index=False)

In [18]:


### TF-IDF ###



In [27]:
# This function tokenizes a normalized review and counts its terms before we compute TF-IDF #
def get_doc_tokens(doc):
    """Count the meaningful tokens of a single document.

    The document is split with ``nltk.word_tokenize``; each token is stripped
    of surrounding whitespace and discarded when it is a stop word (looked up
    in the module-level ``stop_words``), empty, or made up solely of
    punctuation characters.

    Args:
        doc: The document text.

    Returns:
        dict mapping each remaining token to its number of occurrences, with
        keys in order of first appearance so the result is the same on every
        run.
    """
    punctuation_chars = set(string.punctuation)

    tokens = []
    for raw_token in nltk.word_tokenize(doc):
        token = raw_token.strip()
        if token in stop_words:
            continue
        # Check character by character: a substring test against
        # string.punctuation would let tokens such as "..." or "``" through.
        # all() is also True for "", so empty tokens are dropped here too.
        if all(char in punctuation_chars for char in token):
            continue
        tokens.append(token)

    # Counter tallies in a single pass instead of one list.count() per token.
    return dict(Counter(tokens))


# This function is to get TF-IDF matrix #
def get_tf_idf(reviews):
    """Build the smoothed TF-IDF matrix of a collection of documents.

    Term frequency is a term's count divided by the number of counted tokens
    in its document. The inverse document frequency is smoothed as
    ``log((N + 1) / (df + 1)) + 1``, with ``N`` the number of documents and
    ``df`` the number of documents containing the term.

    Args:
        reviews: Iterable of document strings.

    Returns:
        numpy array with one row per document, in input order, and one column
        per distinct term. A document that yields no tokens gets an all-zero
        row.
    """
    reviews = list(reviews)
    n_docs = len(reviews)

    docs_tokens = {idx: get_doc_tokens(doc) for idx, doc in enumerate(reviews)}

    # since we have a small corpus, we can use dataframe to get document-term matrix
    dtm = pd.DataFrame.from_dict(docs_tokens, orient="index")
    # A document with an empty token dict contributes no row above, so force
    # exactly one row per document, in input order. Otherwise row positions
    # stop matching document positions.
    dtm = dtm.reindex(range(n_docs)).fillna(0)
    term_counts = dtm.values.astype(float)

    # number of counted tokens per document, kept as a column for broadcasting
    doc_len = term_counts.sum(axis=1, keepdims=True)

    # term frequency; rows of empty documents stay 0 instead of becoming NaN
    tf = np.divide(
        term_counts,
        doc_len,
        out=np.zeros_like(term_counts),
        where=doc_len > 0,
    )

    # document frequency: in how many documents each term occurs
    doc_freq = (term_counts > 0).sum(axis=0)

    # smoothed inverse document frequency
    smoothed_idf = np.log((n_docs + 1) / (doc_freq + 1)) + 1

    smoothed_tf_idf = tf * smoothed_idf

    return smoothed_tf_idf

Smoothed TF-IDF Matrix
[[0.04916564 0.13178652 0.04547206 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.01935959 0.01935959 0.01935959]]


In [30]:
from scipy.spatial import distance

# This function finds the review most similar to the review with the given review_id #
def find_similar_doc(doc_id, smoothed_tf_idf):
    """Find the document most similar to ``doc_id`` by cosine similarity.

    Args:
        doc_id: Row position of the query document in ``smoothed_tf_idf``.
        smoothed_tf_idf: 2-D array-like with one row per document.

    Returns:
        Tuple ``(best_matching_doc_id, similarity)``: the row position of the
        closest *other* document and its cosine similarity to the query.

    Raises:
        ValueError: If the input is not 2-D or holds fewer than two documents.
        IndexError: If ``doc_id`` is out of range.
    """
    matrix = np.asarray(smoothed_tf_idf, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] < 2:
        raise ValueError("need a 2-D matrix with at least two documents")

    # Compare only the query row against all rows; the full n x n pairwise
    # matrix is not needed to answer a single query.
    query = matrix[doc_id][np.newaxis, :]
    similarity = 1 - distance.cdist(query, matrix, 'cosine')[0]

    # Cosine similarity can come out as NaN for an all-zero row; treat that
    # as "no similarity" so it can never win the ranking.
    similarity[np.isnan(similarity)] = 0.0
    # Exclude the query itself explicitly rather than assuming it sorts first,
    # which fails when another document is identical to it.
    similarity[doc_id] = -np.inf

    best_matching_doc_id = int(np.argmax(similarity))
    return best_matching_doc_id, float(similarity[best_matching_doc_id])

In [31]:
if __name__ == "__main__":
    # Vectorise the cleaned reviews, show the matrix, then look up the review
    # closest to the one at row position 1.
    reviews = df1["review"].tolist()
    tf_idf = get_tf_idf(reviews)
    print("Smoothed TF-IDF Matrix", tf_idf, sep="\n")

    print(find_similar_doc(1, tf_idf))

Smoothed TF-IDF Matrix
[[0.04916564 0.13178652 0.04547206 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.01935959 0.01935959 0.01935959]]
(840, 0.20090364137107763)
