# Prototype to use topic modeling as a means for feature generation


Goal:
    * prototype topic modeling (LDA) as a means for feature generation
    * run feature matrix through some classification models and evaluate results


References:

* [Traditional Methods for Text Data](https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41)
* [An overview of topics extraction in Python with LDA](https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc)
* [gensim](https://radimrehurek.com/gensim/)
* [pyLDAvis](https://pyldavis.readthedocs.io/en/latest/)

In [1]:
from gensim import corpora, models
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from sklearn.feature_extraction.text import CountVectorizer
import re
import pprint

pd.set_option('display.max_colwidth', -1)

%matplotlib inline

In [2]:
# around 100k entries
INFILE="dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-preprocessed-tiny.csv"
TOPIC=20


In [3]:
df = pd.read_csv(INFILE, parse_dates=["review_date"])

In [4]:
review_body_df = df["review_body"]
Y = df["star_rating"]

### do more data cleaning - words with mixed letters and numbers are probably referring to model number - not really useful for our anlysis

In [5]:
review_body_df.head(10)

0    serves purpose loud whoever sitting seat attached                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
1    works really well samsung s6 otterbox defender case charges no problem                                                                                                                                                                                                                                                                                                                   

In [6]:
def clean_mixed_words(x):
    # remove mixed words
    x = re.sub(r'\s+([a-z]+[\d]+[\w]*|[\d]+[a-z]+[\w]*)', '', x)
    # remove numbers
    x = re.sub(r'\s(\d+)', '', x)
    return x
review_body_df = review_body_df.apply(clean_mixed_words)
review_body_df.head(10)

0    serves purpose loud whoever sitting seat attached                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
1    works really well samsung otterbox defender case charges no problem                                                                                                                                                                                                                                                                                                                               

In [7]:
# create BoW features
vectorizer = CountVectorizer()
feature_matrix = vectorizer.fit_transform(review_body_df)
feature_array = feature_matrix.toarray()
vocab = vectorizer.get_feature_names()

In [8]:
print(vocab)



In [9]:
dictionary_LDA = dict(enumerate(vocab))
print(dictionary_LDA)



In [10]:
len(dictionary_LDA.keys())

18926

# Couldn't Quite get gensim to work

In [11]:
count = 0
for d, doc in enumerate(feature_array):
    print(f'd: {d} doc: {doc}')
    count += 1
    if count > 6:
        break

d: 0 doc: [0 0 0 ... 0 0 0]
d: 1 doc: [0 0 0 ... 0 0 0]
d: 2 doc: [0 0 0 ... 0 0 0]
d: 3 doc: [0 0 0 ... 0 0 0]
d: 4 doc: [0 0 0 ... 0 0 0]
d: 5 doc: [0 0 0 ... 0 0 0]
d: 6 doc: [0 0 0 ... 0 0 0]


In [12]:
# num_topics = 20
# lda_model = models.LdaModel(feature_array, 
#                             num_topics=num_topics, \
# #                                   id2word=dictionary_LDA, \
#                                   passes=4)

# Trying sklearn LatenDirichletAllocation

In [13]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, max_iter=10000, random_state=0)
dt_matrix = lda.fit_transform(feature_matrix)


KeyboardInterrupt: 

In [None]:
features = pd.DataFrame(dt_matrix, ignore_index=True)
feature_subset = pd.DataFrame(dt_matrix, columns=['T1', 'T2', 'T3'])
feature_subset
features.to_csv('lda_dt-preprocessed.csv', index=False)

In [None]:
# look at topics
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()
tt_df = pd.DataFrame(tt_matrix, ignore_index=True)
tt_df.to_csv('lda_tt-preprocessed.csv', index=False)

# Let's run KNN

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, Y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_jobs=-1)
neigh.fit(X_train, y_train)
y_predict = neigh.predict(X_test, y_test)



In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_predict)
pprint(report)
report_df = pd.DataFrame(report, ignore_index=True)
report_df.to_csv('lda_out.csv', index=False)