In [1]:
import re
import itertools
import pandas as pd
import datetime as dt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import RegexpTokenizer
import string
import scipy.sparse as sparse
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from nltk.tokenize import word_tokenize

import spacy
import en_core_web_lg
from spacy import displacy
nlp = en_core_web_lg.load()
from starterUtil import getCourseReviewsbyTag, getTagsList
from preprocessing_Util import generateCleanDF, removeStopwords
from spellchecker import SpellChecker
from sklearn import cluster
from sklearn import metrics


In [2]:
NUM_CLUSTERS = 100

topNounWords = {}
tags = getTagsList()
cleanDFwoStopwords = pd.read_csv("cleanReviewswithoutStopwords")

In [12]:
def createTopNounDict(tag):
    if tag == ' ':
        cleanedCorpus = cleanDFwoStopwords['reviews']
    else:
        cleanedCorpus = cleanDFwoStopwords[cleanDFwoStopwords['Tags'] == tag]['reviews']
    noun_dict = {}
    nounTags = ['NN','NNS','NNP','NNPS']
    for review in cleanedCorpus:
        tokens = word_tokenize(str(review))
        postags = get_postags(tokens)
        for i in range(len(postags)):
            if postags[i] in nounTags:
                if tokens[i] in noun_dict:
                    noun_dict[tokens[i]] += 1
                else:
                    noun_dict[tokens[i]] = 1
    #Finding the most appearing nouns

    for key,value in noun_dict.items():
        if value > 500:
            topNounWords[key] = value
    return topNounWords
                    
def get_postags(row):
    
    postags = nltk.pos_tag(row)
    list_classes = list()
    for  word in postags:
        list_classes.append(word[1])
    
    return list_classes

def clusteredTopNouns(tag):
    topNounWords = createTopNounDict(tag)
        

    spell = SpellChecker(distance=2)  # set at initialization
    misspelled = spell.unknown(list(topNounWords.keys()))
    for word in misspelled:
        if spell.correction(word) in topNounWords:
            topNounWords[spell.correction(word)] = topNounWords[spell.correction(word)] + topNounWords.pop(word)
     
    wordvectors = {}
    for index,row in topNounWords.items():
        wordvector = nlp(index).vector
        wordvectors[index] = wordvector
        
    X = np.zeros((len(wordvectors),300))
    for i, (word, vector) in enumerate(wordvectors.items()):
        X[i] = vector

    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS,max_iter=1000)
    kmeans.fit(X)       
    labels = kmeans.labels_

    clusters = {}
    for (word,label) in zip([*wordvectors] , labels):

        count = topNounWords[word]
        if label in clusters:
            clusters[label].append((word, count))
        else:
            clusters[label] = [(word,count)]
            
    filteredFeatures = {}
    for i,clster in enumerate(clusters):
        counter = list(zip(*clusters[clster]))[1]
        words = list(zip(*clusters[clster]))[0]
        word = words[counter.index(max(counter))]
        filteredFeatures[word] = max(counter)
    
    return filteredFeatures
        

In [13]:
topNouns = clusteredTopNouns("['Data Science', 'Machine Learning']")

In [14]:
topNouns

{'course': 116607,
 'understand': 4883,
 'problems': 3737,
 'details': 2992,
 'week': 3349,
 'way': 10391,
 'questions': 1997,
 'videos': 3897,
 'learning': 20390,
 'theory': 2591,
 'programming': 5936,
 'concepts': 14656,
 'data': 3649,
 'assignments': 14986,
 'instructor': 3011,
 'lot': 11602,
 'field': 4086,
 'students': 3345,
 'part': 3569,
 'Professor': 3569,
 'things': 4646,
 'functions': 1277,
 'solutions': 570,
 'notes': 1043,
 'teaching': 1265,
 'Andrew': 26154,
 'Ng': 18674,
 'machine': 21942,
 'Python': 3594,
 'lectures': 3905,
 'ML': 11997,
 'content': 4463,
 'forums': 944,
 'Prof.': 3385,
 'team': 2689,
 'level': 3700,
 'nan': 34165,
 'Great': 6140,
 'knowledge': 8271,
 'implementation': 2236,
 'math': 4160,
 'material': 5236,
 'introduction': 7058,
 'class': 6148,
 'quizzes': 885,
 'Coursera': 4703,
 'practice': 3238,
 'networks': 6230,
 'algebra': 2054,
 'Deep': 5075,
 'help': 2979,
 'Thank': 8896,
 'techniques': 2614,
 'models': 2495,
 'points': 617,
 'topics': 5328,
 '

In [32]:
    
def tagRelevantUnigramProb(unigrams, tag):
    streamReviews = cleanDFwoStopwords[cleanDFwoStopwords['Tags'] == tag]['reviews'].apply(lambda x: str(x))
    reviewText = " ".join(streamReviews)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(reviewText)
    unigramsProb = {}
    for word, ct in unigrams.items():
        count = phraseCountinText(str(word), reviewText)
        if count != 0:
            unigramsProb[word] = -np.log(count/len(words))
    return unigramsProb  

def allUnigramsProb(unigrams):
    raw_text = " ".join(cleanDFwoStopwords['reviews'].apply(lambda x: str(x)))
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(raw_text)
    unigramsProb = {}
    for word, ct in unigrams.items():
        count = phraseCountinText(str(word),raw_text)
        unigramsProb[word] = -np.log(count/len(words))
    return unigramsProb        
            
def phraseCountinText(phrase, text):
    return len(re.findall(phrase,text))
            
def getTagRelevantFeatures(tag, unigrams, threshold):
    tagUnigramProb = tagRelevantUnigramProb(unigrams, tag)
    allUnigramProb = allUnigramsProb(unigrams)
    tagRelevantFeatures = {}
    for i, word in enumerate(tagUnigramProb):
        if allUnigramProb[word] - tagUnigramProb[word] > 0.8:
            tagRelevantFeatures[word] = unigrams[word]
    return tagRelevantFeatures

In [33]:
tagRelevantFeatures = getTagRelevantFeatures("['Data Science', 'Machine Learning']", topNouns, 2)

In [34]:
tagRelevantFeatures

{'learning': 20390,
 'Andrew': 26154,
 'Ng': 18674,
 'machine': 21942,
 'ML': 11997,
 'implementation': 2236,
 'math': 4160,
 'networks': 6230,
 'algebra': 2054,
 'Deep': 5075,
 'models': 2495,
 'exercises': 6756,
 'notebook': 989,
 'Octave': 1843,
 'Stanford': 885,
 'intuition': 2664,
 'NN': 1915,
 'Neural': 3463,
 'applications': 2930,
 'propagation': 1069,
 'Matlab': 1265,
 'algorithms': 4485,
 'regression': 872,
 'Sir': 1209,
 'Linear': 759,
 'sequence': 755,
 'optimization': 948,
 'NLP': 825,
 'hyperparameters': 1040}

In [39]:
tagsList = list(tags)
tagsList.remove('None')

In [40]:
tagsList

["['Arts and Humanities', 'History']",
 "['Arts and Humanities', 'Music and Art']",
 "['Arts and Humanities', 'Philosophy']",
 "['Business', 'Business Essentials']",
 "['Business', 'Business Strategy']",
 "['Business', 'Entrepreneurship']",
 "['Business', 'Finance']",
 "['Business', 'Leadership and Management']",
 "['Business', 'Marketing']",
 "['Computer Science', 'Algorithms']",
 "['Computer Science', 'Computer Security and Networks']",
 "['Computer Science', 'Design and Product']",
 "['Computer Science', 'Mobile and Web Development']",
 "['Computer Science', 'Software Development']",
 "['Data Science', 'Data Analysis']",
 "['Data Science', 'Machine Learning']",
 "['Data Science', 'Probability and Statistics']",
 "['Health', 'Animal Health']",
 "['Health', 'Basic Science']",
 "['Health', 'Health Informatics']",
 "['Health', 'Healthcare Management']",
 "['Health', 'Nutrition']",
 "['Health', 'Patient Care']",
 "['Health', 'Psychology']",
 "['Health', 'Public Health']",
 "['Health', 'R

In [2]:
#Multiclass Sentiment Analyser
import pandas as pd
import datetime as dt
import numpy as np
coursereviews = pd.read_csv("Coursera_reviews.csv")

In [11]:
coursereviews.set_index('rating', inplace=True)

In [38]:
for index in range(1,6,1):
    print(index)

1
2
3
4
5


In [33]:
trainingData = pd.DataFrame(columns = ['review','rating'])
sentencelist = sum(list(map(lambda x: sent_tokenize(str(x)),coursereviews.loc[1]['reviews'])),[])
count = len(sentencelist)
pd.DataFrame(data = [sentencelist, [1]*count], index=['review','rating']).T

Unnamed: 0,review,rating
0,This course is virtually worthless.,1
1,"I couldn't follow the lectures, and I have a P...",1
2,The online course is based on snippets taken f...,1
3,The instructor frequently refers to concepts t...,1
4,This online version of the Yale course was obv...,1
...,...,...
51186,"Some quizzes are senseless, the course itself ...",1
51187,"I did not receive my certificate, though have ...",1
51188,The lessons and teachings do not match up with...,1
51189,i did'nt got my certificate and this site id v...,1


In [41]:
trainingData = pd.DataFrame(columns = ['review','rating'])
for index in range(1,6,1):
    if index == 4 or index == 5:
        samplereviews = coursereviews.loc[index].sample(n = 50000)
    else:
        samplereviews = coursereviews.loc[index]
    sentencelist = sum(list(map(lambda x: sent_tokenize(str(x)),samplereviews['reviews'])),[])
    count = len(sentencelist)
    if (count > 110000):
        sentencelist = random.sample(sentencelist, 110000)
    print(count)
    trainingData = pd.concat([trainingData, pd.DataFrame(data = [sentencelist, [index]*count], index=['review','rating']).T])

51191
48399
109798
95481
93775


In [40]:
from nltk.tokenize import sent_tokenize
import random

In [22]:
sum([[1,2,3],[4,5,6]],[])

TypeError: sum() takes no keyword arguments

In [42]:
trainingData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398644 entries, 0 to 93774
Data columns (total 2 columns):
review    398644 non-null object
rating    398644 non-null object
dtypes: object(2)
memory usage: 9.1+ MB


In [44]:
trainingData.to_csv("trainingData.csv")

In [8]:
import pandas as pd
x = pd.read_csv("trainingdata.csv")

In [10]:
x.drop(labels = ['Unnamed: 0'], inplace=True, axis = 1)

In [11]:
x.to_csv("traindata.csv")

In [9]:
x

Unnamed: 0.1,Unnamed: 0,review,rating
0,0,This course is virtually worthless.,1
1,1,"I couldn't follow the lectures, and I have a P...",1
2,2,The online course is based on snippets taken f...,1
3,3,The instructor frequently refers to concepts t...,1
4,4,This online version of the Yale course was obv...,1
...,...,...,...
398639,398639,I look forward to put the skills I've acquired...,5
398640,398640,Best end lab ever!,5
398641,398641,"very useful,build a good foundation!",5
398642,398642,Fantastic Program,5
