In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def extract_phrases(complaint, max_words=5):
    """
    Return a list of the unique phrases (word n-grams) in one complaint.

    The text is cut into segments at punctuation (, . : ; ? !) and at
    underscores; a phrase never spans two segments. Whitespace-delimited
    tokens that contain a digit are dropped before phrases are formed, so the
    words on either side of such a token become neighbours.

    Parameters
    ----------
    complaint : str
        Raw complaint text. Non-string input (e.g. None, or the float NaN
        pandas uses for a missing cell) yields an empty list.
    max_words : int, optional
        Longest phrase to emit, counted in words. Defaults to 5.

    Returns
    -------
    list of str
        Unique phrases of 1..max_words consecutive words, in arbitrary order.
    """
    if not isinstance(complaint, str):
        return []

    # Turn every sentence delimiter into ' ,' so it is detached from the
    # preceding word and survives tokenisation as a separator of its own.
    text = re.sub(r',|\.|\:|\;|\?|\!', ' ,', complaint)

    # Drop tokens containing a digit (replaced by a blank, which the later
    # split() discards).
    text = ' '.join(' ' if any(c.isdigit() for c in tok) else tok
                    for tok in text.split())

    phrases = set()  # a set removes duplicates as we go
    for segment in re.split(r',|\.|\:|\;|\?|\!|\_', text):
        # Remaining punctuation becomes whitespace before splitting into words.
        words = re.sub(r'[^\w\s]', ' ', segment).split()
        for size in range(1, max_words + 1):
            for begin in range(len(words) - size + 1):
                phrases.add(' '.join(words[begin:begin + size]))

    return list(phrases)

In [3]:
def phrase_table(start, end, data=None):
    '''
    Return table with phrases and corresponding complaint ids

    Parameters
    ----------
    start, end : int
        Rows start, start + 1, ..., end - 1 are processed; each value is used
        to look up the 'ID' and 'Complaint' columns with `column[i]`.
    data : DataFrame, optional
        Frame holding the 'ID' and 'Complaint' columns. When omitted, the
        notebook-level `df` is used, as before.

    Returns
    -------
    DataFrame
        One row per (phrase, complaint) pair, with columns 'Phrase' and 'ID'.
    '''
    if data is None:
        data = df  # legacy behaviour: read the notebook-level frame

    # Fetch the two columns once instead of on every iteration.
    ids = data['ID']
    complaints = data['Complaint']

    phrases = []
    cids = []
    for i in range(start, end):
        found = extract_phrases(complaints[i])
        # extend() grows the lists in place; `phrases = phrases + found`
        # re-copied the whole accumulated list on every row (quadratic).
        phrases.extend(found)
        cids.extend([ids[i]] * len(found))

    d = pd.DataFrame({'Phrase': phrases})
    d['ID'] = cids
    return d

## Extract Phrases and Get Counts

In [4]:
# Load the table that supplies the 'Recall' class for each complaint 'ID';
# it is joined onto the phrase table on 'ID' further down.
df_recall = pd.read_csv('Complaint_Verified_Encoded.csv')

In [5]:
# Build the phrase/ID table for rows 0 .. 1249652.
# NOTE(review): phrase_table reads a notebook-level `df` that no visible cell
# defines, and 1249653 is presumably that frame's row count — confirm both
# before a Restart & Run All.
df_phrase = phrase_table(0, 1249653)

In [6]:
# Attach the recall information to every (phrase, ID) row; the left join keeps
# phrases whose ID has no match in df_recall.
df_phrase = df_phrase.merge(df_recall, how='left', on='ID')

In [7]:
# Count how often each phrase occurs within each recall class, then persist
# the counts so the following cells can restart from the CSV.
df_count = (
    df_phrase
    .groupby(['Phrase', 'Recall'])
    .size()
    .reset_index(name='Counts')
)
df_count.to_csv('totalcount.csv', index=False)

In [9]:
# Reload the per-class phrase counts and split them by recall class.
# keep_default_na=False: phrases are free text, and read_csv would otherwise
# parse tokens such as "NA" or "NULL" as missing values, silently losing the
# phrase itself.
d = pd.read_csv('totalcount.csv', keep_default_na=False)
d0 = d[d.Recall == 0]  # non-recall rows
d1 = d[d.Recall == 1]  # recall rows

In [10]:
# Give each class's 'Counts' column its final name before merging. Merging the
# frames as-is produces 'Counts_x' / 'Counts_y', and the column selection
# below would raise KeyError. rename() ignores a missing key, so this is also
# safe if the columns were already renamed upstream.
non_recall = d0.rename(columns={'Counts': 'Non-recall Counts'})
recall = d1.rename(columns={'Counts': 'Recall Counts'})

# Outer join: keep phrases that occur in only one of the two classes.
dffreq = pd.merge(non_recall, recall, how='outer', on='Phrase')
dffreq = dffreq[['Phrase', 'Non-recall Counts', 'Recall Counts']]
dffreq = dffreq.sort_values('Phrase')  # sort alphabetically

In [11]:
# A phrase absent from one class has no count there after the outer join;
# treat that as zero occurrences.
dffreq = dffreq.fillna(0)

In [12]:
# The outer join / fillna leaves the counts as floats; cast both back to
# integers, then add the per-phrase total.
dffreq = dffreq.astype({'Non-recall Counts': int, 'Recall Counts': int})
dffreq['Total Counts'] = dffreq['Non-recall Counts'].add(dffreq['Recall Counts'])

In [13]:
# Persist the complete frequency table (written before the minimum-count
# filter in the next cell is applied).
dffreq.to_csv('Frequency_Table.csv', index = False)

In [15]:
# Keep only phrases seen at least 10 times in total.
# .copy() makes the result an independent frame: columns are added to it in
# later cells, and assigning to a filtered slice is what raises pandas'
# SettingWithCopyWarning.
dffreq = dffreq[dffreq['Total Counts'] >= 10].copy()
dffreq.head()

Unnamed: 0,Phrase,Non-recall Counts,Recall Counts,Total Counts
0,A,61247,716654,777901
1,A A,39,227,266
17,A A C,1,12,13
45,A A MONTH,1,11,12
77,A AAA,4,55,59


In [17]:
# Renumber the rows 0..n-1 after filtering (the ratio computation below looks
# rows up by these labels). Rebinding instead of inplace=True yields a fresh
# frame and keeps the cell chainable.
dffreq = dffreq.reset_index(drop=True)

## Calculate Phrase Score

In [25]:
def calc_ratio(df, p, name, row=None):
    """
    Store the recall / non-recall count ratio of one row in column `name`.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Non-recall Counts' and 'Recall Counts'. Modified in
        place; column `name` is created on the first write.
    p : object
        Unused. Kept so existing calls keep working.
    name : str
        Column that receives the ratio.
    row : index label, optional
        Row to process. When omitted, the notebook-level loop variable `i`
        is used, which is what the function relied on originally.

    The stored value is inf when the non-recall count is 0, 0 when the recall
    count is 0, and recall / non-recall otherwise.
    """
    if row is None:
        row = i  # legacy: fall back to the caller's global loop index

    nrc = df['Non-recall Counts'][row]
    rc = df['Recall Counts'][row]

    if nrc == 0:
        # inf if all are recalled
        ratio = float('Inf')
    elif rc == 0:
        # 0 if all are non recalled
        ratio = 0.0
    else:
        ratio = rc / nrc

    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar
    # setter that replaces it.
    df.at[row, name] = ratio

In [26]:
# Recall / non-recall ratio for every phrase, computed column-wise instead of
# one calc_ratio() call per row. Same rule as calc_ratio: inf when a phrase has
# no non-recall occurrences, otherwise recall / non-recall (which is 0 when
# the recall count is 0).
non_recall_counts = dffreq['Non-recall Counts']
dffreq['Ratio'] = np.where(
    non_recall_counts == 0,
    np.inf,
    dffreq['Recall Counts'] / non_recall_counts,
)

In [28]:
# Natural log of the ratio, element by element: inf stays inf, and a ratio of
# 0 becomes -inf.
dffreq['log'] = dffreq['Ratio'].map(np.log)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
# Preview the table, which now carries the 'Ratio' and 'log' columns.
dffreq.head()

Unnamed: 0,Phrase,Non-recall Counts,Recall Counts,Total Counts,Ratio,log
0,A,61247,716654,777901,11.701047,2.459678
1,A A,39,227,266,5.820513,1.761388
2,A A C,1,12,13,12.000000,2.484907
3,A A MONTH,1,11,12,11.000000,2.397895
4,A AAA,4,55,59,13.750000,2.621039
5,A AAA SERVICE,0,13,13,inf,inf
6,A ABLE,1,9,10,9.000000,2.197225
7,A ABLE TO,1,9,10,9.000000,2.197225
8,A ABNORMAL,0,19,19,inf,inf
9,A ABOUT,6,80,86,13.333333,2.590267


In [30]:
# Save the scored phrase table. index=False is not passed here, so the row
# index is written as an extra unnamed leading column (the other exports in
# this notebook omit it).
dffreq.to_csv('Phrase_Ratio_05_14.csv')

In [31]:
# Five separate files, one per phrase length (1 to 5 words).

phrase = dffreq.Phrase
dffreq['wordcount'] = dffreq.Phrase.apply(lambda x: len(x.split()))

# (word count, output file) pairs, written in this order.
exports_by_wordcount = [
    (1, 'One_Word_Phrase_w_Score.csv'),
    (2, 'Two_Words_Phrase_w_Score.csv'),
    (3, 'Three_Words_Phrase_w_Score.csv'),
    (4, 'Four_Words_Phrase_w_Score.csv'),
    (5, 'Five_Words_Phrase_w_Score.csv'),
]
for n_words, out_name in exports_by_wordcount:
    dffreq[dffreq.wordcount == n_words].to_csv(out_name, index = False)

## Calculate Complaint Score

In [5]:
# Keep only phrases whose log-ratio exceeds 1, then build the
# phrase -> log-ratio lookup used to score complaints.
dffreq = dffreq.loc[dffreq['log'] > 1]
phrase = dffreq['Phrase']
phrase_ratio = dffreq.set_index('Phrase')['log'].to_dict()

In [9]:
def calc_comp_score(filename, scores=None):
    """
    Sum the phrase scores of every complaint listed in a phrase file.

    Parameters
    ----------
    filename : str
        CSV with at least a 'Phrase' and an 'ID' column, one row per
        (phrase, complaint) pair.
    scores : dict, optional
        Mapping phrase -> score. When omitted, the notebook-level
        `phrase_ratio` lookup is used.

    Returns
    -------
    DataFrame
        Columns 'ID' and 'Ratio' (the summed score); one row per ID that has
        at least one scored phrase. Rows whose phrase has no score are
        ignored.
    """
    if scores is None:
        scores = phrase_ratio  # legacy behaviour: notebook-level lookup

    df = pd.read_csv(filename)

    # .copy() gives an independent frame, so adding the 'log' column below is
    # not an assignment on a filtered slice.
    df = df[df.Phrase.isin(list(scores))].copy()

    # Dictionary lookup per phrase; every remaining phrase is a key of `scores`.
    df['log'] = df.Phrase.map(scores)

    return df.groupby(by = ['ID'])['log'].sum().reset_index(name = 'Ratio')

In [None]:
# Score every complaint from its phrases and save the result.
# NOTE(review): no visible cell writes 'phrase.csv' — presumably it is the
# phrase/ID table built earlier; confirm its origin.
df_ratio = calc_comp_score('phrase.csv')
#df_ratio = pd.merge(df_recall, df_ratio, how = 'left', on = 'ID')
# Missing scores become 0 (only relevant when the left merge above is enabled).
df_ratio['Ratio'] = df_ratio['Ratio'].fillna(0)
df_ratio.to_csv('Complaints_Score_05_14.csv')