# Prerequisites & Load Data

In [1]:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [5]:
# Read the 'TFAdata_final' sheet of the survey workbook.
df_raw = pd.read_excel(
    '~/Documents/work/DataKind/Teach For America/TFAdata_final.xlsx',
    sheet_name='TFAdata_final',
)

# Keep only rows whose PID can be parsed as a number; unparseable PIDs are
# coerced to NaN and filtered out.
pid_numeric = pd.to_numeric(df_raw['PID'], errors='coerce')
df_raw = df_raw[pid_numeric.notnull()]

In [6]:
# Preview the first two rows of the PID-filtered raw data.
df_raw.head(2)

Unnamed: 0,PID,Survey Code,Region,Corps,Person of Color,Applicant Type,Economic Background,Grade Level,Subject Group,Institute,...,csi4,csi5,csi6,csi7,csi8,csi10,csi12,gensat1,nps,npscomment
0,2334439,1617EYS,Hawai'i,2nd year,Person of color,CGP,Low Income Background,HIGH,Social Studies,Phoenix Institute,...,,,,,,,,,,
1,2480034,1617EYS,New York,2nd year,Not person of color,CGP,Low Income Background,ELEM,General Ed,New York Institute,...,,,,,,,,,,


In [7]:
# Work on a copy so that columns added to df do not modify df_raw.
df = df_raw.copy()

In [8]:
# Label -> numeric code lookups for the categorical survey columns.
cat_econ = {
    'Not Low Income Background': 0,
    'Low Income Background': 1,
}
cat_grade = {
    'NO GRADE': 0,
    'PRE-K': 1,
    'K': 2,
    'ELEM': 3,
    'MIDDLE': 4,
    'HIGH': 5,
}
# Both capitalisations of the labels are listed; 'No ethnicity listed' maps to
# None, so those rows end up with a missing isPOC value.
cat_poc = {
    'Person of color': 1,
    'Not person of color': 0,
    'No ethnicity listed': None,
    'Not Person of Color': 0,
    'Person of Color': 1,
}

# Derived numeric columns, added in this order: isLowIncome, is2nd, gradeLevel, isPOC.
df['isLowIncome'] = df['Economic Background'].map(cat_econ)
# Integer category codes of the Corps label (in the displayed output,
# '2nd year' is coded as 1).
df['is2nd'] = df['Corps'].astype("category").cat.codes
df['gradeLevel'] = df['Grade Level'].map(cat_grade)
df['isPOC'] = df['Person of Color'].map(cat_poc)

# Render the pandas-profiling report for the enriched frame as the cell output.
df.profile_report()



# Text Cleaning, Lemmatization, Stemming, POS Tagging

In [46]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import re
import string
# Fetch every NLTK resource the text-cleaning helpers rely on, so the cell also
# works on a machine where they were not downloaded beforehand. Packages that
# are already present are reported as up to date.
nltk.download('punkt')                       # tokenizer models used by word_tokenize
nltk.download('stopwords')                   # stop-word lists read just below
nltk.download('averaged_perceptron_tagger')  # model used by pos_tag
nltk.download('wordnet')                     # corpus read by WordNetLemmatizer.lemmatize
# NOTE(review): newer NLTK releases may expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Translation table replacing every ASCII punctuation character with a space.
trans = str.maketrans(string.punctuation, " "*len(string.punctuation))


def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS code.

    The codes are returned as string literals, which are the values of
    ``nltk.corpus.wordnet.ADJ`` / ``VERB`` / ``NOUN`` / ``ADV`` and the values
    accepted by ``WordNetLemmatizer.lemmatize(word, pos=...)``. Returning the
    literals keeps the function usable without ``wordnet`` being imported,
    which this notebook never does.

    Parameters
    ----------
    pos_tag : str
        Tag as produced by ``nltk.pos_tag`` (e.g. ``'JJ'``, ``'VBZ'``, ``'NN'``).

    Returns
    -------
    str
        ``'a'`` (adjective), ``'v'`` (verb), ``'n'`` (noun) or ``'r'`` (adverb).
        Any tag outside those four families falls back to ``'n'``.
    """
    # The first letter of the tag identifies the coarse word class.
    if pos_tag.startswith('J'):
        return 'a'  # wordnet.ADJ
    if pos_tag.startswith('V'):
        return 'v'  # wordnet.VERB
    if pos_tag.startswith('R'):
        return 'r'  # wordnet.ADV
    # Nouns ('N...') and every other tag are treated as nouns.
    return 'n'  # wordnet.NOUN
    
def cleanText(text, isStem=False, isLem=False):
    """Normalise a comment and POS-tag its tokens.

    The text is lower-cased, punctuation is replaced via the module-level
    ``trans`` table, and the result is split with ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw comment text.
    isStem, isLem : bool
        Accepted but currently not used: no stemming, lemmatisation or
        stop-word removal is applied by this function.

    Returns
    -------
    tuple
        ``(cleaned_text, pos_tags)`` where ``cleaned_text`` is the tokens
        joined by single spaces and ``pos_tags`` is the ``pos_tag`` output
        for those tokens.
    """
    normalized = text.lower().translate(trans)
    tokens = word_tokenize(normalized)
    tagged_tokens = pos_tag(tokens)
    return " ".join(tokens), tagged_tokens

# Try cleanText on one sample comment; the cell displays the returned
# (cleaned text, POS tags) tuple.
strSample = "I answer neutrally because this depends entirely on the individual and their passion/motivation to begin an experience like TFA."
cleanText(strSample)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanyanabila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tanyanabila/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


('i answer neutrally because this depends entirely on the individual and their passion motivation to begin an experience like tfa',
 [('i', 'NN'),
  ('answer', 'VBP'),
  ('neutrally', 'RB'),
  ('because', 'IN'),
  ('this', 'DT'),
  ('depends', 'VBZ'),
  ('entirely', 'RB'),
  ('on', 'IN'),
  ('the', 'DT'),
  ('individual', 'NN'),
  ('and', 'CC'),
  ('their', 'PRP$'),
  ('passion', 'NN'),
  ('motivation', 'NN'),
  ('to', 'TO'),
  ('begin', 'VB'),
  ('an', 'DT'),
  ('experience', 'NN'),
  ('like', 'IN'),
  ('tfa', 'NN')])

# Sentiment Analysis

In [52]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Keep every respondent who left an NPS comment. A bare dropna() would also
# discard rows with a missing value in ANY other column (unanswered survey
# items, isPOC for 'No ethnicity listed', ...), silently shrinking the sample.
# .copy() makes df_comment an independent frame so the assignments below do
# not write into a view of df.
df_comment = df.dropna(subset=['npscomment']).copy()

# Cleaned comment text (first element of cleanText's return value). astype(str)
# guards against comments that Excel stored as numbers.
df_comment['cleanComment'] = [
    cleanText(comment)[0] for comment in df_comment['npscomment'].astype(str)
]

# One VADER score dict per comment, expanded into the columns
# neg / neu / pos / compound and aligned on df_comment's index. Building the
# frame in one go avoids the slow per-row apply(pd.Series).
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(comment) for comment in df_comment['cleanComment']],
    index=df_comment.index,
)
df_comment = pd.concat([df_comment, sentiment_scores], axis=1)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tanyanabila/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [53]:
# Inspect the first comment row, including the cleanComment column and the
# appended neg / neu / pos / compound sentiment scores.
df_comment.head(1)

Unnamed: 0,PID,Survey Code,Region,Corps,Person of Color,Applicant Type,Economic Background,Grade Level,Subject Group,Institute,...,npscomment,isLowIncome,is2nd,gradeLevel,isPOC,cleanComment,neg,neu,pos,compound
9,2933019,1617EYS,Rhode Island,2nd year,Not person of color,Applied as Senior,Not Low Income Background,HIGH,Math,Philadelphia Institute,...,There are a lot of people who shouldn't do TFA...,0.0,1,5.0,0.0,there are a lot of people who shouldn t do tfa...,0.0,0.912,0.088,0.5095
