In [None]:
from __future__ import print_function

import itertools
import json
import math
import warnings
from collections import Counter
from functools import partial
from os import path

# Silence library warnings before the third-party imports below so that
# import-time deprecation notices stay out of the notebook output.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from PIL import Image
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline


## Word Cloud

In [None]:
def word_cloud(filename):
    """Draw a word cloud from a word-frequency CSV and return the frequencies.

    The file is read without a header row: its first column becomes the index
    (the words) and the following column holds the counts. The first row is
    skipped and rows whose count is missing are dropped.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        Mapping of word -> integer count, exactly as handed to the word cloud.
    """
    # Read the file named by the argument. The previous code passed the literal
    # string 'filename' to read_csv, so the parameter was silently ignored.
    table = pd.read_csv(filename, header=None, index_col=0)[1:]
    counts = table[1].dropna()
    # Counts may arrive as strings (e.g. when the skipped first row is a
    # textual header), hence the explicit int() conversion.
    dict_ = {word: int(count) for word, count in counts.items()}

    plt.figure(figsize=(20,20))
    wordcloud = WordCloud(max_font_size=40, max_words=500,background_color="white").generate_from_frequencies(dict_)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    return dict_

## Basic description

In [None]:
class ReviewsDataset():
    """Review corpus loaded from a CSV file, with a text-statistics report.

    Attributes (all populated by ``load``):
        X          -- values of the chosen text column, one entry per example
        X_original -- reset to an empty list by ``load``
        y          -- the ``sentiment`` column of the loaded file
    """

    def __init__(self, **kwargs):
        super(ReviewsDataset, self).__init__(**kwargs)
        # This will be initialised by the load method with all the dataset examples
        self.X = None
        self.X_original = None
        # This will be initialised by the load method with all the dataset classes
        self.y = None

    def load(self, filename,col='text'):
        """
        Loads the dataset from the specified filename.

        Parameters
        ----------
        filename : str
            Path of the CSV file; it must contain a ``sentiment`` column.
        col : str, default 'text'
            Name of the column holding the review text.
        """
        print("Loading data from filename {}".format(filename))
        dataset = pd.read_csv(filename)
        self.X = dataset[col].values
        self.X_original = []
        self.y = dataset.sentiment

    @staticmethod
    def _tokens(document):
        """Return the tokens of one example as a list.

        Strings are split on whitespace; iterating a string directly would
        count characters rather than words. Missing values (None or a float
        such as NaN from an empty CSV cell) count as no tokens. Anything else
        is assumed to be an already-tokenized iterable.
        """
        if isinstance(document, str):
            return document.split()
        if document is None or isinstance(document, float):
            return []
        return list(document)

    def report(self):
        """
        Prints relevant information about the dataset

        Printed: number of examples, average example length in tokens,
        vocabulary size, the 10 most and 10 least frequent tokens, and the
        mean / standard deviation of frequency * rank. Plotted: log-frequency
        of the 50 most common tokens, log(rank) against log(frequency), and
        the frequency * rank series.

        Raises
        ------
        ValueError
            If ``load`` has not been called yet.
        """
        # we assume that both X and Y have been correctly loaded
        if self.X is None:
            raise ValueError("Remember to call 'load' to load the dataset!")

        n_examples = len(self.X)
        print("Number of training examples %d" % n_examples)

        vocabulary = Counter()
        sum_of_words = 0
        for document in self.X:
            tokens = self._tokens(document)
            sum_of_words += len(tokens)
            vocabulary.update(tokens)

        # Guard the division so an empty dataset reports 0.0 instead of crashing.
        average_length = sum_of_words / float(n_examples) if n_examples else 0.0
        print("Average sentence length: {} words".format(str(average_length)))
        print("Vobcaulary size: " + str(len(vocabulary)))
        print(vocabulary.most_common(10))
        print(vocabulary.most_common()[:-11:-1])

        # Log-scaled frequency distribution of the 50 most common tokens.
        fdist1 = FreqDist({token: math.log(count)
                           for token, count in vocabulary.items()})
        plt.subplot(4,1,1)
        fdist1.plot(50)

        # Keep the counts as integers: wrapping the (token, count) pairs in a
        # numpy array would coerce the counts to strings and break the maths.
        ranked_counts = [count for _, count in vocabulary.most_common()]
        log_rank = []
        log_frequence = []
        frequence_ratio = []
        for rank, count in enumerate(ranked_counts, start=1):
            log_rank.append(math.log(rank))
            log_frequence.append(math.log(count))
            frequence_ratio.append(count * rank)
        print("the mean of frequence ratio:", np.mean(frequence_ratio))
        print("the std of frequence ratio:", np.std(frequence_ratio))

        plt.subplot(4,1,2)
        plt.plot(np.array(log_rank),np.array(log_frequence))
        plt.title('log(rank) vs log(frequence)')
        plt.xlabel('log(rank)')
        plt.ylabel('log(frequence)')
        plt.show()

        plt.subplot(4,1,3)
        plt.plot(frequence_ratio)
        plt.title('Plot of frequence ratio')
        plt.show()

      

## Topic Modeling

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for every row of ``model.components_``, its highest-weighted terms.

    Each topic is printed as a blank line, a ``Topic #<index>:`` label followed
    directly by the ``n_top_words`` feature names with the largest weights
    (largest first, space separated), and then a 70-character ``=`` rule.
    """
    rule = "=" * 70
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = []
        for position in best_first:
            top_terms.append(feature_names[position])
        print("\nTopic #{}:".format(topic_no) + " ".join(top_terms))
        print(rule)

In [None]:
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer passes every token through ``lemm``."""

    def build_analyzer(self):
        """Return the inherited analyzer wrapped so each token is lemmatized."""
        base_analyzer = super(LemmaCountVectorizer, self).build_analyzer()

        def lemmatized_tokens(doc):
            # Lazily lemmatize the tokens produced by the stock analyzer.
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatized_tokens
    
# NOTE(review): `train` is not created by any cell above; it must already exist
# in the kernel (presumably a DataFrame with a `text` column) -- confirm where
# it is loaded so the notebook survives Restart & Run All.
# Collect every training document into a plain Python list.
text = list(train.text.values)
# Term counts over lemmatized tokens, built with the subclass defined above.
vectorizer_options = dict(
    max_df=0.95,
    min_df=2,
    stop_words='english',
    decode_error='ignore',
)
tf_vectorizer = LemmaCountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(text)

In [None]:
## try lda
# Fit an online LDA topic model on the term-count matrix `tf`.
n_components=11
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

lda.fit(tf)

n_top_words = 40
print("\nTopics in LDA model: ")
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one on releases that predate it. list() keeps the result a list,
# as the old accessor returned.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

#### Word cloud for each topic:

In [None]:
def word_cloud_sub(num,tf_feature_names=tf_feature_names, n_words=50):
    """Show a word cloud of the top-weighted terms of one LDA topic.

    Parameters
    ----------
    num : int
        Row index into the module-level ``lda.components_``.
    tf_feature_names : sequence of str
        Vocabulary aligned with the columns of ``lda.components_``. The default
        is bound when this function is defined, so re-run this cell after
        refitting the vectorizer.
    n_words : int, default 50
        How many of the highest-weighted terms to draw (previously hard-coded).
    """
    topic = lda.components_[num]
    # argsort is ascending; the reversed slice keeps the n_words largest weights.
    top_indices = topic.argsort()[:-n_words - 1:-1]
    topic_words = [tf_feature_names[i] for i in top_indices]
    # NOTE(review): each term is passed to generate() exactly once, so the
    # topic weights themselves are not given to the cloud -- confirm whether
    # generate_from_frequencies() with the weights was intended.
    cloud = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          width=2500,
                          height=1800
                         ).generate(" ".join(topic_words))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()


In [None]:
# Draw the word cloud of every LDA topic, in topic order.
topic_numbers = range(n_components)
for num in topic_numbers:
    word_cloud_sub(num)