In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset; the relative path is resolved against the kernel's
# working directory.
train = pd.read_csv('train2electricboogaloo.csv')

# Print the frame's column summary as a first sanity check.
train.info()

In [None]:
# Display summary statistics for the loaded frame.
train.describe()

In [None]:
# Replace missing values in the free-text columns with empty strings so the
# string processing further down never receives NaN.
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column pulled off the frame: that chained
# in-place form is deprecated in pandas 2.x and silently leaves `train`
# untouched once copy-on-write is enabled.
text_columns = ['Summary', 'Text', 'SumTxt']
train[text_columns] = train[text_columns].fillna('')

In [None]:
# Preview the first rows of the frame.
train.head()

In [None]:
# Row count of the full frame.
len(train)

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Plot histograms for the full frame. The trailing semicolon stops the
# notebook from echoing the returned array of Axes objects next to the figure.
train.hist(figsize=(20,15));

In [None]:
from pandas.plotting import scatter_matrix

# Columns compared pairwise in the scatter matrix below.
attributes = [
    "HelpfulRatio",
    "Score",
    "Time",
    "FoundHelpful",
    "HelpfulVotes",
    "NumUserReviews",
    "NumProdReviews",
]

In [None]:
# Pairwise scatter plots of the selected columns. The trailing semicolon
# stops the notebook from echoing the returned array of Axes objects.
scatter_matrix(train[attributes], figsize=(15, 15));

In [None]:
import string
import nltk
from nltk.corpus import stopwords

In [None]:
# Translation table that deletes every character in string.punctuation; built
# once here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily-filled cache for the NLTK English stop-word list. Loading it inside
# the function on every call meant re-reading the corpus once per document
# when the function is used as a CountVectorizer analyzer.
_ENGLISH_STOP_WORDS = None


def text_process(mess, stop_words=None):
    """Tokenise a piece of text into a list of non-stop-word tokens.

    Punctuation (``string.punctuation``) is removed, the text is split on
    whitespace, and tokens whose lower-cased form is a stop word are dropped.
    The surviving tokens keep their original casing.

    Args:
        mess: The text to tokenise (a ``str``).
        stop_words: Optional container of lower-case words to exclude. When
            omitted, the NLTK English stop-word list is used; it is loaded on
            the first call and reused afterwards.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS

    nopunc = mess.translate(_PUNCT_TABLE)
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set, stratifying on the Score column;
# the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(
    train,
    test_size=0.2,
    stratify=train['Score'],
    random_state=42,
)

In [None]:
# Histograms for the training split. The trailing semicolon stops the
# notebook from echoing the returned array of Axes objects.
train_set.hist(figsize=(20,15));

In [None]:
# Histograms for the held-out split, for comparison with the training split.
# The trailing semicolon stops the notebook from echoing the returned array
# of Axes objects.
test_set.hist(figsize=(20,15));

In [None]:
# Row count of the training split.
len(train_set)

In [None]:
# Row count of the held-out split.
len(test_set)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Row subsets of the full frame (not of the train/test split): reviews whose
# Score is exactly 1, and reviews whose Score is exactly 5.
low_train = train[train['Score'].eq(1)]
high_train = train[train['Score'].eq(5)]

In [None]:
# Preview the Score == 1 subset.
low_train.head()

In [None]:
# Preview the Score == 5 subset.
high_train.head()

In [None]:
# Bag-of-words counts for the Score == 1 reviews. text_process supplies the
# tokens; max_df and max_features bound the vocabulary.
vect = CountVectorizer(
    analyzer=text_process,
    max_df=.15,
    max_features=10000,
)
X = vect.fit_transform(low_train['SumTxt'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# 10-topic LDA over the count matrix X. fit_transform is used so the model is
# fitted and the per-document topic output is produced in a single call,
# which avoids a separate (slow) transform step afterwards.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=25,
    learning_method="batch",
    random_state=0,
)
document_topics = lda.fit_transform(X)

In [None]:
# Report the shape of the fitted components_ matrix.
print(f"lda.components_.shape: {lda.components_.shape}")

In [None]:
# For each topic (a row of components_), argsort orders the feature indices
# by ascending weight; reversing the columns with [:, ::-1] makes the order
# descending, so the highest-weighted feature comes first.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# Get the feature names from the vectorizer. get_feature_names() was removed
# in scikit-learn 1.2, so prefer get_feature_names_out() and fall back to the
# old method only on releases that predate it.
if hasattr(vect, "get_feature_names_out"):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

In [None]:
import mglearn as mglearn

In [None]:
# Show the 10 top-ranked words for each of the 10 topics, five topics per
# printed chunk.
mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    n_words=10,
    topics_per_chunk=5,
)

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Vectorizer with the same settings as `vect`, but fitted on the SumTxt
# column of the full frame.
# NOTE(review): despite the "low_" prefix this is fitted on `train`, not on
# `low_train` - confirm that is intended.
low_bow_transformer = CountVectorizer(
    analyzer=text_process,
    max_df=.15,
    max_features=10000,
)
low_bow_transformer = low_bow_transformer.fit(train['SumTxt'])

In [None]:
# Number of entries in the fitted vocabulary.
print(len(low_bow_transformer.vocabulary_))

In [None]:
# Vectorize the SumTxt column of the full frame with the fitted vocabulary.
low_bow_transformed = low_bow_transformer.transform(train['SumTxt'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fresh, unfitted LDA model with the same settings as the earlier one; this
# rebinds `lda`, so the model fitted on X is no longer reachable by that name.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=25,
    learning_method="batch",
    random_state=0,
)

In [None]:
# Fit the new LDA model on the full-frame counts; this overwrites the
# document_topics produced by the earlier fit.
document_topics = lda.fit_transform(low_bow_transformed)

In [None]:
# Per-topic feature indices ordered from highest to lowest weight: argsort
# is ascending, and [:, ::-1] reverses each row.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# Get the feature names from the vectorizer. get_feature_names() was removed
# in scikit-learn 1.2, so prefer get_feature_names_out() and fall back to the
# old method only on releases that predate it.
if hasattr(low_bow_transformer, "get_feature_names_out"):
    feature_names = np.asarray(low_bow_transformer.get_feature_names_out())
else:
    feature_names = np.array(low_bow_transformer.get_feature_names())

In [None]:
# Show the 10 top-ranked words for each of the 10 topics of the refitted
# model, five topics per printed chunk.
mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    n_words=10,
    topics_per_chunk=5,
)

In [None]:
# Vectorize only the training split, using the vocabulary that was fitted on
# the full frame.
sumtxt_low_bow = low_bow_transformer.transform(train_set['SumTxt'])

In [None]:
# NOTE(review): `bow_transformer` is not defined in any cell visible above
# (only `vect` and `low_bow_transformer` are). On a fresh kernel this raises
# NameError unless it is created elsewhere - confirm where it comes from.
print(len(bow_transformer.vocabulary_))

In [None]:
# Displays the entire vocabulary mapping as the cell output.
# NOTE(review): `bow_transformer` is not defined in any cell visible above;
# confirm where it is created before relying on this cell.
bow_transformer.vocabulary_

In [None]:
# Vectorize the training split's SumTxt column with `bow_transformer`.
# NOTE(review): `bow_transformer` is not defined in any cell visible above;
# confirm where it is created before relying on this cell.
sumtxt_bow = bow_transformer.transform(train_set['SumTxt'])