# Text Analysis

In [203]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import string 

## 1. Read in Data

In [204]:
full_data = pd.read_excel('Master JCCS LC Mods.xlsx',sheetname = 'All Data')

In [205]:
#Bayside_data= pd.read_excel('Master JCCS LC Mods.xlsx',sheetname = 'Bayside')
#Lamesa_data= pd.read_excel('Master JCCS LC Mods.xlsx',sheetname = 'La Mesa')
#Reflection_data= pd.read_excel('Master JCCS LC Mods.xlsx',sheetname = 'Reflections')

In [206]:
print("There are totally " + str(full_data.shape[0]) + " records in our data set")

There are totally 235 records in our data set


In [207]:
full_data.head(3)

Unnamed: 0,School,FirstName,LastName,Date,Listening,Playing,Singing,Sharing,Trying,Working,Surprise,Effort,Hopeful,Support
0,Bayside,Evaline,Aeids,2016-12-08,5,5,5,4,3,0,I learned another cord,I sang,I sang with teacher,The cords
1,Bayside,Eveline,Aeids,2017-01-05,4,5,0,3,4,1,I surprised myself by liking how I played the bas,Playing the bass for th first time,Trying a new instrument,No
2,Bayside,Mario,Aguirre,2016-12-15,4,4,0,0,4,0,"Yes, I didn't know guitar was fun and easy",I tried reading the notes,That I am learning a new instrument,Guitar class was nice


## 2. Clean Data

There are some blanks in student response, and I set them to 'N/A' in order to run the code successfully.

In [208]:
full_data.loc[full_data['Surprise'].isnull(), 'Surprise'] = 'N/A'
full_data.loc[full_data['Effort'].isnull(), 'Effort'] = 'N/A'
full_data.loc[full_data['Hopeful'].isnull(), 'Hopeful'] = 'N/A'
full_data.loc[full_data['Support'].isnull(), 'Support'] = 'N/A'

## 3. Analyze text information seperately for four questions

In [209]:
word_surprise = full_data['Surprise']
word_effort = full_data['Effort']
word_hopeful = full_data['Hopeful']
word_support = full_data['Support']

In [272]:
word_surprise = word_surprise.apply(lambda x: x.replace("'",""))
word_effort = word_effort.apply(lambda x: x.replace("'",""))
word_hopeful = word_hopeful.apply(lambda x: x.replace("'",""))
word_support = word_support.apply(lambda x: x.replace("'",""))

### 3.1 Q1--How did you suprise yourself today? 

### Method 1: Manually calculate the word frequency

In [273]:
stopword = stopwords.words('english')

In [274]:
punctuation = set(string.punctuation)
Unifreq = defaultdict(int)
for line in word_surprise:
    r = line.lower()
    r = ''.join([c for c in r if not c in punctuation])
    for word in r.split():
        if not word in stopword:
            Unifreq[word] += 1


In [275]:
Unifreq = [(Unifreq[w], w) for w in Unifreq]
Unifreq.sort()
Unifreq.reverse()
Unifreq[:15]

[(35, 'guitar'),
 (32, 'played'),
 (29, 'playing'),
 (27, 'new'),
 (25, 'song'),
 (25, 'learned'),
 (23, 'play'),
 (21, 'didnt'),
 (12, 'learn'),
 (11, 'chords'),
 (9, 'something'),
 (9, 'learning'),
 (8, 'chord'),
 (7, 'yes'),
 (7, 'good')]

### Method 2: Using CountVectorizer

In [285]:
def MostFreqWord(ngram_range, word_key, limit):
    cv = CountVectorizer(ngram_range = ngram_range, stop_words = 'english',token_pattern = '\\b\\w+\\b')
    cv.fit(word_key)
    countvect = cv.transform(word_key).toarray()
    voc = cv.vocabulary_
    count = countvect.sum(axis = 0)
    freq = dict()
    for word in voc:
        freq[word] = count[voc[word]]    
    freq = [(freq[w], w) for w in freq]
    freq.sort()
    freq.reverse()
    print("The most " + str(limit)+ " frequent keywords and there corresponding frequencies are:")
    print(freq[:limit])
    return(freq[:limit])
       

In [287]:
surprise_uni = MostFreqWord((1,1),word_surprise, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(35, 'guitar'), (32, 'played'), (29, 'playing'), (27, 'new'), (25, 'song'), (25, 'learned'), (23, 'play'), (21, 'didnt'), (12, 'learn'), (11, 'chords'), (10, 'did'), (9, 'learning'), (8, 'chord'), (7, 'yes'), (7, 'good')]


In [288]:
effort_uni = MostFreqWord((1,1),word_effort, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(54, 'playing'), (26, 'play'), (25, 'guitar'), (22, 'song'), (21, 'learning'), (15, 'new'), (14, 'chords'), (10, 'g'), (9, 'chord'), (8, 'strings'), (8, 'played'), (7, 'notes'), (7, 'instrument'), (7, 'bass'), (6, 'trying')]


In [289]:
hopeful_uni = MostFreqWord((1,1),word_hopeful, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(38, 'playing'), (32, 'song'), (30, 'guitar'), (24, 'play'), (22, 'played'), (13, 'music'), (12, 'learning'), (11, 'good'), (10, 'songs'), (10, 'new'), (10, 'learned'), (8, 'making'), (6, 'notes'), (6, 'n'), (6, 'did')]


In [290]:
support_uni = MostFreqWord((1,1),word_support, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(16, 'learning'), (15, 'play'), (12, 'guitar'), (11, 'chords'), (9, 'song'), (9, 'nope'), (8, 'playing'), (8, 'notes'), (7, 'better'), (6, 'time'), (6, 'thanks'), (6, 'practice'), (5, 'want'), (5, 'thank'), (5, 'strings')]


In [291]:
surprise_bi = MostFreqWord((2,2),word_surprise, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(12, 'playing guitar'), (10, 'play guitar'), (10, 'new song'), (7, 'learned new'), (6, 'learn new'), (5, 'played song'), (5, 'learning new'), (4, 'play new'), (4, 'learned play'), (4, 'g chord'), (4, 'electric guitar'), (3, 'played new'), (3, 'played electric'), (3, 'new chords'), (3, 'didnt surprise')]


In [292]:
effort_bi = MostFreqWord((2,2),word_effort, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(10, 'playing guitar'), (7, 'learning new'), (6, 'play guitar'), (6, 'new song'), (4, 'playing song'), (4, 'playing new'), (4, 'playing instrument'), (4, 'new chords'), (3, 'singing class'), (3, 'playing g'), (3, 'played guitar'), (3, 'play song'), (3, 'play g'), (3, 'learned play'), (3, 'g string')]


In [293]:
hopeful_bi = MostFreqWord((2,2),word_hopeful, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(9, 'playing guitar'), (8, 'play guitar'), (6, 'played song'), (4, 'did good'), (3, 'playing music'), (3, 'played guitar'), (3, 'making song'), (3, 'blues riff'), (2, 'wrote song'), (2, 'string c'), (2, 'song good'), (2, 'read notes'), (2, 'playing song'), (2, 'playing new'), (2, 'playing difficult')]


In [294]:
support_bi = MostFreqWord((2,2),word_support, 15)

The most 15 frequent keywords and there corresponding frequencies are:
[(4, 'want learn'), (4, 'learning play'), (3, 'reading notes'), (3, 'playing guitar'), (3, 'play better'), (2, 'time practice'), (2, 'teach play'), (2, 'song time'), (2, 'practice good'), (2, 'play faster'), (2, 'need help'), (2, 'learning notes'), (2, 'learning new'), (2, 'learning chords'), (2, 'extra time')]


In [297]:
[a for (a,b) in support_uni]

[16, 15, 12, 11, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 5]

In [301]:
results = pd.DataFrame()
results['Surprise_unigram'] =  [b for (a,b) in surprise_uni]
results['Surprise_unigram_freq'] =  [a for (a,b) in surprise_uni]
results['Surprise_bigram'] =  [b for (a,b) in surprise_bi]
results['Surprise_bigram_freq'] =  [a for (a,b) in surprise_bi]

In [302]:
results.to_csv("Text_Analysis_Yinzhuo_Ding.csv", index=False)

In [243]:
cv = CountVectorizer(ngram_range = (1,1), stop_words = 'english',token_pattern = '\\b\\w+\\b')

In [244]:
cv.fit(word_surprise)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [245]:
countvect = cv.transform(word_surprise).toarray()
voc = cv.vocabulary_
count = countvect.sum(axis = 0)

In [246]:
freq = dict()
for word in voc:
    freq[word] = count[voc[word]]   

In [247]:
freq = [(freq[w], w) for w in freq]
freq.sort()
freq.reverse()
freq[:15]

[(35, 'guitar'),
 (32, 'played'),
 (29, 'playing'),
 (27, 'new'),
 (25, 'song'),
 (25, 'learned'),
 (24, 't'),
 (23, 'play'),
 (21, 'didn'),
 (12, 'learn'),
 (11, 'chords'),
 (10, 'did'),
 (9, 'learning'),
 (8, 'chord'),
 (7, 'yes')]