In [41]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import random
import operator
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from statistics import *
from sklearn.feature_extraction.text import CountVectorizer
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import textstat
import warnings
import nltk
warnings.filterwarnings('ignore')

%matplotlib inline

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [32]:
data = pd.read_csv('text_data.csv')

In [35]:
data.head()

Unnamed: 0,Student Response Type,Level (Unit),Outcome,Action,Input,Total Num Hints,KC (Default)
0,ATTEMPT,IWT_S09explainTutorA-A,INCORRECT,UpdateComboBox,a,,s12
1,ATTEMPT,IWT_S09explainTutorA-A,CORRECT,UpdateComboBox,a,,s3
2,ATTEMPT,IWT_S09explainTutorA-A,INCORRECT,UpdateComboBox,the,,s1
3,ATTEMPT,IWT_S09explainTutorA-A,INCORRECT,UpdateComboBox,no article,,s8
4,ATTEMPT,IWT_S09explainTutorA-A,INCORRECT,UpdateComboBox,no article,,s11


In [6]:
data['Student Response Type'].value_counts()

ATTEMPT         21519
HINT_REQUEST      880
Name: Student Response Type, dtype: int64

In [10]:
data['Level (Unit)'].value_counts()

IWT_S09explainTutorA-B    3209
IWT_S09articleTutorA-A    3024
IWT_S09explainTutorB-A    2966
IWT_S09articleTutorB-B    2936
IWT_S09explainTutorB-B    2718
IWT_S09explainTutorA-A    2599
IWT_S09articleTutorA-B    2476
IWT_S09articleTutorB-A    2471
Name: Level (Unit), dtype: int64

In [11]:
data['Outcome'].value_counts()

CORRECT          13376
INCORRECT         7974
HINT               709
HINT_NEXT          156
HINT_PREVIOUS       15
Name: Outcome, dtype: int64

In [13]:
data['Action'].value_counts()

UpdateComboBox     19281
ButtonPressed       2170
UpdateTextArea       605
UpdateTextField      343
Name: Action, dtype: int64

In [18]:
data['Problem View'].value_counts()

1    21771
2      538
3       64
4       12
6       11
5        3
Name: Problem View, dtype: int64

In [19]:
data['Total Num Hints'].value_counts()

3.0    331
2.0    322
4.0    212
1.0     10
Name: Total Num Hints, dtype: int64

In [57]:
data['KC (Default)'].value_counts()

s3         2313
s1         2249
s2         2120
unnamed    2103
s4         1792
s11        1550
s8         1408
s12        1394
s7         1387
s6         1367
s9         1353
s10        1207
s5         1152
done        535
e9          290
Name: KC (Default), dtype: int64

# LDA - Topic Modelling

### Scenario

Imagine a scenario where a varsity sales associate and prospective client are having a conversation. They talk for a few minutes and the sales associate feels this lead is not going to convert to a customer. Now, what can we do that can help the sales associate close this call. 

We could provide information that she/he can use in the conversation that gives a better understanding of the prospective customer. 

We can give information at two points during the call.
1. At the start of the call
2. During the conversation

Based on the vasts amounts of call data that varsity has, we can identify topics of concern for people in specific states, students in specific school/ state districts, both characteristic or behavior wise, academic or otherwise. 

For examples:-
Parents of students from California would have drastically different concerns compared to ones in Washington or Nebraska. Student in California might be participating in more after school music and sports class where students in Washington are more concerned to know about the experience of varsity instructors to Washington's education state board curriculum, whereas parents in Nebraska might be more concerned about pricing and frequency of classes. 

Knowing by collecting and processing this data through LDA and Cognitive Task Analysis(Explained later) could facilitate the follow intervention through a pop-up message during a call

<i>"Hey Kevin, this conversation seem similar to the one you had 2 days ago. (or similar to the one Cathy has a month ago)
Your clients main concerns seem to be 
1. Timing
2. Location 

Here is a summary of the call from our past records:-
Problem:- Timing/ Scheduling conflict due to after school sports and music class
Soultion :- Schedule Algebra tutor session during school lunch breaks.
Number of instructors comfortable with that timing: 12"
</i>

Possible solutions for Nebraskan pricing calls could be sign-on with spaced sessions (Make it twice a month instead of 4 times a month).

One of the advantages of this method and a problem it solves is that an LDA model can analyse all the sales conversations at better speed and efficiency than a sales associate.


In [40]:
# SpaCy Parser for questions
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens


In [43]:
data['Input'] = data['Input'].astype(str)

In [68]:
tqdm.pandas()
general_text = data['Input'].progress_apply(spacy_tokenizer)


  0%|          | 0/22399 [00:00<?, ?it/s][A
  2%|▏         | 342/22399 [00:00<00:06, 3413.51it/s][A
  5%|▍         | 1019/22399 [00:00<00:05, 4009.18it/s][A
  7%|▋         | 1620/22399 [00:00<00:04, 4453.59it/s][A
 10%|▉         | 2218/22399 [00:00<00:04, 4821.53it/s][A
 13%|█▎        | 2863/22399 [00:00<00:03, 5216.49it/s][A
 17%|█▋        | 3786/22399 [00:00<00:03, 5998.92it/s][A
 20%|█▉        | 4414/22399 [00:00<00:03, 5894.54it/s][A
 23%|██▎       | 5082/22399 [00:00<00:02, 6109.95it/s][A
 26%|██▌       | 5873/22399 [00:00<00:02, 6557.45it/s][A
 29%|██▉       | 6604/22399 [00:01<00:02, 6762.39it/s][A
 33%|███▎      | 7297/22399 [00:01<00:02, 5846.82it/s][A
 36%|███▋      | 8124/22399 [00:01<00:02, 6409.97it/s][A
 41%|████      | 9195/22399 [00:01<00:01, 7287.51it/s][A
 45%|████▍     | 9993/22399 [00:01<00:01, 6730.81it/s][A
 48%|████▊     | 10780/22399 [00:01<00:01, 7031.91it/s][A
 52%|█████▏    | 11661/22399 [00:01<00:01, 7479.17it/s][A
 56%|█████▋    | 12625/22

### Count Vectorizer

In [69]:
vectorizer_general_text = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
general_text_vectorized = vectorizer_general_text.fit_transform(general_text)


### Applying LDA Model

In [70]:
# Latent Dirichlet Allocation Model
lda_general_text = LatentDirichletAllocation(n_components=20, max_iter=15, learning_method='online',verbose=True)
general_text_lda = lda_general_text.fit_transform(general_text_vectorized)

iteration: 1 of max_iter: 15
iteration: 2 of max_iter: 15
iteration: 3 of max_iter: 15
iteration: 4 of max_iter: 15
iteration: 5 of max_iter: 15
iteration: 6 of max_iter: 15
iteration: 7 of max_iter: 15
iteration: 8 of max_iter: 15
iteration: 9 of max_iter: 15
iteration: 10 of max_iter: 15
iteration: 11 of max_iter: 15
iteration: 12 of max_iter: 15
iteration: 13 of max_iter: 15
iteration: 14 of max_iter: 15
iteration: 15 of max_iter: 15


In [71]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

### Analysing All the Knowledge Components

In [72]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_general_text, general_text_vectorized, vectorizer_general_text, mds='tsne')
dash

#### Explanation

The previous LDA graph does not give use much useful info based on the topics and associated words. By filtering our problem space, we could extract more context. 

We can implement an LDA to identify topics within self-reflection answers for:-
1. Knowledge Component 3 that are tagged incorrect. 
2. Knowledge Component 3 that are tagged correct. 

We can try infer the main topics/problems based on the words.

In [95]:
tqdm.pandas()
general_text = data['Input'][data['KC (Default)'] == 's3'][data['Outcome'] == 'INCORRECT'].progress_apply(spacy_tokenizer)

vectorizer_general_text = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
general_text_vectorized = vectorizer_general_text.fit_transform(general_text)

# Latent Dirichlet Allocation Model
lda_general_text = LatentDirichletAllocation(n_components=5, max_iter=15, learning_method='online',verbose=True)
general_text_lda = lda_general_text.fit_transform(general_text_vectorized)

pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_general_text, general_text_vectorized, vectorizer_general_text, mds='tsne')
dash


  0%|          | 0/911 [00:00<?, ?it/s][A
 65%|██████▍   | 592/911 [00:00<00:00, 5914.84it/s][A
100%|██████████| 911/911 [00:00<00:00, 5729.91it/s][A

iteration: 1 of max_iter: 15
iteration: 2 of max_iter: 15
iteration: 3 of max_iter: 15
iteration: 4 of max_iter: 15
iteration: 5 of max_iter: 15
iteration: 6 of max_iter: 15
iteration: 7 of max_iter: 15
iteration: 8 of max_iter: 15
iteration: 9 of max_iter: 15
iteration: 10 of max_iter: 15
iteration: 11 of max_iter: 15
iteration: 12 of max_iter: 15
iteration: 13 of max_iter: 15
iteration: 14 of max_iter: 15
iteration: 15 of max_iter: 15


### Explanation
Once the LDA model is finely tuned to make sense, an expert, in this case an instructor would sit down and look at the topic,, the words associated and a few examples of topic 1 answer examples to assign a name to the topic

KC-3 is identifying grammatical structures from a given sentence. So the question has example text that asks the student to select the noun in the question. 

For Topic 1:- The problem here seems to be something to do with singular noun phrases. 
For Topic 2:- Word noun modifiers

We can infer here that over 80% of the problems seem to be due to difficulty in identifying singular noun phrases and differentiating between noun modifiers. 


In [84]:
tqdm.pandas()
general_text = data['Input'][data['KC (Default)'] == 's3'][data['Outcome'] == 'CORRECT'].progress_apply(spacy_tokenizer)

vectorizer_general_text = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
general_text_vectorized = vectorizer_general_text.fit_transform(general_text)

# Latent Dirichlet Allocation Model
lda_general_text = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online',verbose=True)
general_text_lda = lda_general_text.fit_transform(general_text_vectorized)

pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_general_text, general_text_vectorized, vectorizer_general_text, mds='tsne')
dash


  0%|          | 0/1261 [00:00<?, ?it/s][A
 61%|██████    | 764/1261 [00:00<00:00, 7639.48it/s][A
100%|██████████| 1261/1261 [00:00<00:00, 7585.16it/s][A

iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5


### Real-Time Testing
We can test out new data in the following way. 

In [90]:
data['Input'][15]

'The noun is modified with an ordinal number (like "first" "second" "third") or other ranking word (like "next" or "last")'

In [93]:
lda_general_text.transform(general_text_vectorized[15])

array([[0.10000138, 0.10000389, 0.1000009 , 0.59999176, 0.10000207]])

Shows that this particular text is related to topic number 4 which has the highest weight.

# Cognitive Task Analysis

### Scenario

If we identify certain tasks as a set to steps to be performed, we can build a CTA on that task that will help us understand, measure, and evaluate the completion of set of steps and their effects on a particular business or academic outcome.

In sales calls, there are multiple scenarios that sales associates are trained on. We can pick a scenario, represent it as a set of steps, measure/identify their occurances in the conversatgion/communication, and evaluate the importance or probability of a preferred outcome. 

The next set of slides give a high-level understanding of the process. 

On way we can track these steps in natural language is by simple knowing the most representative words of each step. This could either me human coded or an NLP model could aggregrate associated word. A 3-part hybrid approach that I have used in the past is to first come up with a set of CTA steps manually for eprsonal reference, second, crowdsource the text data through MTurk and make participants identify and tag specific parts-of-speech, and third, use an NLP model to train on the tags. 

We will have the Manual CTA and an NLP model that classifies accurately that could be used in production after mapping the Manual CTA to the NLP Model. 

In an academic, context, this will also help us in identifying expert blindspots during tutoring sessions by instructors as shown in the slide below. 

![CTA%20-%201.png](attachment:CTA%20-%201.png)

![CTA%20-%202.png](attachment:CTA%20-%202.png)

![Expert%20Blindspot.png](attachment:Expert%20Blindspot.png)

In [143]:
sw1 = ['python', 'scikit-learn']
sw2 = ['panda','numpy']
sw3 = ['Numpy', 'matplotlib']
sw4 = ['sklearn']
sw5 = ['keras', 'Tensorflow']

#["who", "what", "when", "where", "why", "how", "?"]
#["which", "won't", "can't", "isn't", "aren't", "is", "do", "does", "will", "can", "is"]

'''WHICH, AM, ARE, WAS, WERE, MAY, MIGHT, CAN, COULD, WILL, SHALL, WOULD, SHOULD, HAS, HAVE, HAD, and DID. Perhaps also IF to go with WHEN. Also consider IN, AT, TO, FROM, and ON, plus maybe UNDER and OVER.'''

# Importance of each stage
sww1 = 1
sww2 = 2
sww3 = 2
sww4 = 2
sww5 = 3

In [144]:
data['Stage 1'] = 0
data['Stage 2'] = 0
data['Stage 3'] = 0
data['Stage 4'] = 0
data['Stage 5'] = 0

data['Stage 1'] = data['Input'].apply(lambda x: len([x for x in str(x).replace(';',' ').replace(',', ' ').split(' ') if x in sw1]))
data['Stage 2'] = data['Input'].apply(lambda x: len([x for x in str(x).replace(';',' ').replace(',', ' ').split(' ') if x in sw2]))
data['Stage 3'] = data['Input'].apply(lambda x: len([x for x in str(x).replace(';',' ').replace(',', ' ').split(' ') if x in sw3]))
data['Stage 4'] = data['Input'].apply(lambda x: len([x for x in str(x).replace(';',' ').replace(',', ' ').split(' ') if x in sw4]))
data['Stage 5'] = data['Input'].apply(lambda x: len([x for x in str(x).replace(';',' ').replace(',', ' ').split(' ') if x in sw5]))

data['Stage 1 Score'] = data['Stage 1'] * sww1
data['Stage 2 Score'] = data['Stage 2'] * sww2
data['Stage 3 Score'] = data['Stage 3'] * sww3
data['Stage 4 Score'] = data['Stage 4'] * sww4
data['Stage 5 Score'] = data['Stage 5'] * sww5
data['Final Score'] = data['Stage 1 Score'] + data['Stage 2 Score'] + data['Stage 3 Score'] + data['Stage 4 Score'] + data['Stage 5 Score']

### Syntax Analysis

An alternate approach is by relying entirely on syntax analysis that will give us more information like sentence structures and make the above process more accurate.

In [None]:

SBARQ (Situation, Background, Assessment, Recommendation, Question)

(SBARQ (WH+ (W+) ...)
       (SQ ...*
           (V+) ...*)
       (?))
       
The WH+ node (WHNP/WHADVP/WHADJP) contains the question stem (who/what/when/where/why/how) and the SQ holds the inverted phrase.

(SBARQ 
  (WHNP 
    (WP What)) 
  (SQ 
    (VBZ is) 
    (NP 
      (DT the) 
      (NN question)))
  (. ?))


Penn-II TreeBank or QuestionBank NLTK Parser

https://www.computing.dcu.ie/~jjudge/pubs/judge06acl.pdf

![Syntax%20Tree.png](attachment:Syntax%20Tree.png)

## Readibility Scores

### Scenario

Readibility scores give us a metric that is an estimation of how difficult or hard a particular set of text is to comprehend for people at different levels in school, college, and post college. Modified versions of these metrics are used by the DOD, US Air force to review and maintain consistency in verbal communication and technical manuals to increase quick comprehension and to avoid cognitive roadblocks. Several book publishers use these as a metric to assess the ideal audience/market segment for a particular book. 

At varsity, we can use this for sales conversations, instructor students sessions, and written student assessments and feedback. 

Each readibility metrics gives a particular score for a given text. a score of 6.4 means that a student in 6th grade should be able to read this text but would be hard for a student in 5th grade. 

In sales conversations, if the conversation is with a student, we migh want to avoid large deviations from the grade of the particular student If you are talking with a parent, The goal is not to check for the agreement between a parents age with the score of the conversation. We might want to avoid having conversations that is specific to 5th grade curriculum vocabulary. Eventhough a parent can theoretically understand 5th grade level conversations, they probably would not remember concepts in studied during 5th grade. 

Additionally, this also relates to the point mentioned about trying to relate a student's ability from one concept to another especially if we are trying to transfer qualities from abstract concepts like art to concrete concepts like math or Integral calculus to applied physics. 

Below is a study I mentioned about difficulty of story, word, vs equation based problem that is related to this. 

In [146]:
<img src="SWE Example.png]">

SyntaxError: invalid syntax (<ipython-input-146-12d8dc7fce77>, line 1)

#### Movie Reviews Dataset

In [132]:
from nltk.corpus import movie_reviews
reviews = []
for fileid in movie_reviews.fileids():
    tag, filename = fileid.split("/")
    reviews.append((filename, tag, movie_reviews.raw(fileid)))
    
fdf = pd.DataFrame(reviews, columns = ['filename','tag','text'])

In [117]:
%matplotlib inline

from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

import textstat
from tqdm import tqdm
tqdm.pandas()
import numpy as np
from statistics import *

In [119]:
def plotability2v(a,b,title,bins=0.1,colors = ['#3A4750', '#F64E8B']):
    trace1 = ff.create_distplot([a,b],['Positive','Negative'], bin_size=bins, colors=colors, show_rug = False)
    trace1['layout'].update(title=title)
    iplot(trace1, filename="DistPlot")
    table_data= [["Statistical Measures","Sincere questions","Insincere questions"],
                ["Mean",mean(a),mean(b)],
                ["Standard Deviation",pstdev(a),pstdev(b)],
                ["Variance",pvariance(a),pvariance(b)],
                ["Median",median(a),median(b)],
                ["Maximum value",max(a),max(b)],
                ["Minimum value",min(a),min(b)]]
    trace2 = ff.create_table(table_data)
    iplot(trace2, filename='Table')

def plotability1v(a,title,bins=0.1,colors = ['#F64E8B']):
    trace1 = ff.create_distplot([a],['Single'], bin_size=bins, colors=colors, show_rug = False)
    trace1['layout'].update(title=title)
    iplot(trace1, filename="DistPlot")
    table_data= [["Statistical Measures","Sincere questions"],
                ["Mean",mean(a)],
                ["Standard Deviation",pstdev(a)],
                ["Variance",pvariance(a)],
                ["Median",median(a)],
                ["Maximum value",max(a)],
                ["Minimum value",min(a)]]
    trace2 = ff.create_table(table_data)
    iplot(trace2, filename='Table')   
    
# One function for all plots
def plot_readability(a,b,title,bins=0.1,colors=['#3A4750', '#F64E8B']):
    trace1 = ff.create_distplot([a,b], ["Sincere questions","Insincere questions"], bin_size=bins, colors=colors, show_rug=False)
    trace1['layout'].update(title=title)
    iplot(trace1, filename='Distplot')
    table_data= [["Statistical Measures","Correct Answers","Incorrect Answers"],
                ["Mean",mean(a),mean(b)],
                ["Standard Deviation",pstdev(a),pstdev(b)],
                ["Variance",pvariance(a),pvariance(b)],
                ["Median",median(a),median(b)],
                ["Maximum value",max(a),max(b)],
                ["Minimum value",min(a),min(b)]]
    trace2 = ff.create_table(table_data)
    iplot(trace2, filename='Table')

### Each of these metrics can be trained on specific domains or a set of problems to give us more accurate results. The below models are baseline generalised public versions. 

### The Flesch Reading Ease formula

In [133]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.flesch_kincaid_grade(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.flesch_kincaid_grade(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"Flesch-Kincaid Reading Ease Grade Level", 0.5, colors=['#EE82EE','#0d98ba'])

### The Fog Scale (Gunning FOG Formula)

In [134]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.gunning_fog(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.gunning_fog(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"The Fog Scale (Gunning FOG Formula)", 0.5, colors=['#EE82EE','#0d98ba'])

### Automated Readibility Index

In [135]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.automated_readability_index(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.automated_readability_index(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"Automated Readibility Index", 0.5, colors=['#EE82EE','#0d98ba'])

### The Coleman-Liau Index

In [136]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.coleman_liau_index(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.coleman_liau_index(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"The Coleman-Liau Index", 0.5, colors=['#EE82EE','#0d98ba'])

### Linsear Write Formula

In [138]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.linsear_write_formula(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.linsear_write_formula(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"Linsear Write Formula", 0.5, colors=['#EE82EE','#0d98ba'])

### Dale-Chall Readability Score

In [139]:
fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: textstat.dale_chall_readability_score(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: textstat.dale_chall_readability_score(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"Dale-Chall Readability Score", 0.5, colors=['#EE82EE','#0d98ba'])

### All Consensus

In [141]:
def consensus_all(text):
    return textstat.text_standard(text,float_output=True)

fdf['fre_pos_i'] = fdf['text'][fdf['tag']=='pos'].apply(lambda x: consensus_all(x))
fdf['fre_neg_i'] = fdf['text'][fdf['tag']=='neg'].apply(lambda x: consensus_all(x))
fdf['sent_count'] = fdf['text'].apply(lambda x: len([x for x in x.split('.')]))
fdf['fre_pos_i'] = fdf['fre_pos_i']/fdf['sent_count']
fdf['fre_neg_i'] = fdf['fre_neg_i']/fdf['sent_count']
fre_pos = fdf['fre_pos_i']
fre_neg = fdf['fre_neg_i']
fre_pos = [x for x in fre_pos if str(x) != 'nan']
fre_neg = [x for x in fre_neg if str(x) != 'nan']
plot_readability(fre_pos,fre_neg,"All Consensus", 0.5, colors=['#EE82EE','#0d98ba'])

# Sales Analysis - CTA/ Topic Modelling/ Categorical

### <b>Priming</b>

Priming is a phenomenon when a stimulus prior to an event changes the participants behavior during the event. 

General Example: Undergrads Lavish Spending after watching looking at photos of other students buying expensive clothes etc. 
Varsity Specific Example:- As given in the LDA example above, by personalizing the Varsity website to problem topics in each state, say in california we promote the sense of starting a tutoring session any place anytime since timing is a bigh issue, whereas in nebraska we focus on affordability and ROI.

Eduardo Porter, “How Money Affects Morality,” New York Times (January 3, 2013), http://economix.blogs.nytimes.com/2013/06/13/how-money- affects-morality/, accessed February 23, 2015.

Example:- French and German Wine and Music in Restaurant

Adrian C. North, David J. Hargreaves, and Jennifer McKendrick, “The Influence of In-Store Music on Wine Selections,” Journal of Applied Psychology 84, no. 2 (1999): 271–276.

### <b>Nudging</b>

General Example:- Default Organ Donors are less likely to leave the organ donor program vs asking non-donors to Opt-In

https://sparq.stanford.edu/solutions/opt-out-policies-increase-organ-donation

Davidai, S., Gilovich, T., & Ross, L. (2012). The meaning of default options for potential organ donors. Proceedings of the National Academy of Sciences, 15201-15205.

### <b>Covariance</b>

General Example:- Expectation that if the car's exterior is clean, then the interior is clean as well, including the motor parts.
Varsity Specific Example:- Design of website and quality of reviews posted on reviews would improve trust and preception of services provided.
Michael R. Solomon, Sarah Drenan, and Chester A. Insko, “Popular Induction: When Is Consensus Information Informative?” Journal of Personality 49, no. 2 (1981): 212–224.

Howard Beales, Michael B. Mazis, Steven C. Salop and Richard Staelin, “Consumer Search and Public Policy,” Journal of Consumer Research 8, no. 1 (June 1981): 11–22.


### <b>Family - Collective Decision Making</b>

Varsity Specific Example:- By personalizing emails/ brochures/ sales calls based on age of student and role of the person calling varsity to know about the service as stated below.

<b>Different Roles</b>
1. <b>Initiator</b> — The person who brings up the idea or identifies a need.
2. <b>Gatekeeper</b> — The person who conducts the information search and controls the flow of information available to the group. In organizational contexts, the gatekeeper identifies possible vendors and products for the rest of the group to consider.
3. <b>Influencer </b>— The person who tries to sway the outcome of the decision. Some people may be more motivated than others to get involved, and participants also possess different amounts of power to get their point across.
4. <b>Buyer </b>— The person who actually makes the purchase. The buyer may or may not actually use the product.
5. <b>User</b> — The person who actually consumes the product or service.

<b>Different Types of decisions in a family:-</b>

<b>1. Consensual Purchase Decision</b>

Example: - Buying a Dog and splitting up the responsibilities amobg family members

<b>2. Accomodative Purchase Decision</b>

Example: - TV Subscription
Involves bargaining, coercion, and compromise


Harry L. Davis, “Decision-Making Within the Household,” Journal of Con- sumer Research 2 (March 1972): 241–260; Michael B. Menasco and David J. Curry, “Utility and Choice: An Empirical Study of Wife/Husband Decision- Making,” Journal of Consumer Research 16 (June 1989): 87–97; Conway Lackman and John M. Lanasa, “Family Decision-Making Theory: An Overview and Assessment,” Psychology & Marketing 10 (March–April 1993): 81–94.

Shannon Dortch, “Money and Marital Discord,” American Demographics (October 1994): 11.

### Loss Aversion

General Example:- Teacher's paid an extra bonus at the start of the year to improve a student's progress but were told that if the student's performance does not increase the bonus would be deducted from salaries at the end of the year are more dedicated to improving the student's learning compared to teachers who were promised bonus only at the end of the year. 

Varsity Specific Example:- Increase Tutor motivation through incentives prior to an assignment vs at the end. 

https://www.npr.org/sections/health-shots/2012/09/18/161159263/teachers-expectations-can-influence-how-students-perform
