In [10]:
import pandas as pd
import spacy
import textacy
from pyate import combo_basic, cvalues, term_extractor, basic

In [50]:
#Read corpus generated by the Corpus class
corpus_df = pd.read_csv('../data/corpus/corpus_augmented_backtranslation.csv')
#Read classification results
best_classification_df = pd.read_csv('../data/evaluation/best_classiciation.csv',index_col='index')
best_classification_df['y_pred'] = best_classification_df['y_pred'].fillna('Unlabeled')
best_classification_df['y_pred'] = best_classification_df['y_pred'].astype(str)

In [51]:
best_classification_indexonly = best_classification_df['y_pred']
groupkey_to_label = best_classification_indexonly.to_dict()

In [52]:
corpus_df['label'] = corpus_df['group_key']
corpus_df = corpus_df.replace({'label': groupkey_to_label})

In [53]:
#Filter corpus to products in evaluation set
corpus_df['feature_descriptor'] = corpus_df['feature_descriptor'].apply(lambda x: x.split('.')[0])
corpus_df = corpus_df[corpus_df['label'].isin(['community-cloud','service-cloud','sales-cloud','salesforce-quote-to-cash','commerce-cloud'])]

In [6]:
def get_terms(corpus_df, K=None,max_doc_depth=None):
    nlp = spacy.load("en_core_web_sm")
    corpus_df = corpus_df[corpus_df['label']!='Unlabeled']
    if max_doc_depth:
        corpus_df = corpus_df[corpus_df['intra_doc_depth']<=max_doc_depth]
    grouped = corpus_df.groupby('label').agg(descriptor_text=('feature_descriptor', ', '.join))
    grouped['descriptor_text'] = grouped['descriptor_text'].apply(lambda x: x[:1000000] if len(x)>1000000 else x)
    #Set K to 20 if not provided
    if not K:
        K=20

    grouped['combo_basic']=grouped['descriptor_text'].map(lambda x: list(combo_basic(x).nlargest(K).to_dict().keys()))

    grouped['basic']=grouped['descriptor_text'].map(lambda x: list(basic(x).nlargest(K).to_dict().keys()))

    grouped['cvalues']=grouped['descriptor_text'].map(lambda x: list(cvalues(x).nlargest(K).to_dict().keys()))

    #grouped['term_extractor']=grouped['descriptor_text'].map(lambda x: list(term_extractor(x).nlargest(K).to_dict().keys()))

    #grouped['textrank']=grouped['descriptor_text'].map(lambda x: list(dict(textacy.extract.keyterms.textrank(nlp(x),window_size=2, edge_weighting="binary", normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'), position_bias=False,topn=K)).keys()))

    grouped['singlerank']=grouped['descriptor_text'].map(lambda x: list(dict(textacy.extract.keyterms.textrank(nlp(x),window_size=10, edge_weighting="count",normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'), position_bias=False,topn=K)).keys()))

    grouped['positionrank']=grouped['descriptor_text'].map(lambda x: list(dict(textacy.extract.keyterms.textrank(nlp(x),window_size=10, edge_weighting="count",normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'), position_bias=True,topn=K)).keys()))

    grouped['yake']=grouped['descriptor_text'].map(lambda x: list(dict(textacy.extract.keyterms.yake(nlp(x),window_size=10,normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'),topn=K)).keys()))

    #grouped['sckae']=grouped['descriptor_text'].map(lambda x: textacy.extract.keyterms.scake(nlp(x),normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'),topn=K))

    #grouped['sgrank']=grouped['descriptor_text'].map(lambda x: textacy.extract.keyterms.sgrank(nlp(x),normalize="lower", include_pos=('NOUN','PROPN','ADJ','VERB','ADP','ADJ'),topn=K))
    return grouped

In [33]:
#Extract top K terms...
K15=get_terms(corpus_df,15,2)

In [34]:
K24=get_terms(corpus_df,24,2)
K34=get_terms(corpus_df,34,2)

In [36]:
#Drop column with text and keep label and terms only
K15=K15.drop(['descriptor_text'], axis=1)
K24=K24.drop(['descriptor_text'], axis=1)
K34=K34.drop(['descriptor_text'], axis=1)
#..and save

KeyError: "['descriptor_text'] not found in axis"

In [37]:
K15.to_csv('../data/evaluation/K15.csv')
K24.to_csv('../data/evaluation/K24.csv')
K34.to_csv('../data/evaluation/K34.csv')

In [7]:
import sklearn
import numpy as np
from sklearn import feature_selection
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,3))
corpus = corpus_df['feature_descriptor_clean']
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
y = corpus_df["label"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.97
dtf_features = pd.DataFrame()
for cat in np.unique(y):
    chisq, p = feature_selection.chi2(X_train, y==cat)
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":1-p, "y":cat}))
    dtf_features = dtf_features.sort_values(["y","score"],
                    ascending=[True,False])
    dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
X_names = dtf_features["feature"].unique().tolist()

In [14]:
for cat in np.unique(y):
   print("# {}:".format(cat))
   print("  . selected features:",
         len(dtf_features[dtf_features["y"]==cat]))
   print("  . top features:", ",".join(
dtf_features[dtf_features["y"]==cat]["feature"].values[:10]))
   print(" ")


# Unlabeled:
  . selected features: 721
  . top features: 360 data,360 data manager,approval,approval process,branded app,change set,chatter,clean,com,com clean
 
# cms:
  . selected features: 204
  . top features: any,banner,block,builder,builder create,classic content,cloud content,cloud content builder,cloudpages,cms
 
# commerce-cloud:
  . selected features: 562
  . top features: 20,20 release,account group,b2b,b2b commerce,b2b commerce visualforce,b2c,b2c commerce,behalf,bot enabled
 
# community-cloud:
  . selected features: 656
  . top features: access support documentation,advanced sharing,advanced sharing establish,assertion,authentication,authentication provider,clients partners,communities,community,community members
 
# consumer-goods-cloud:
  . selected features: 242
  . top features: accuracy,action plan template,analytics consumer,analytics consumer goods,assessment task,assortment,audit,books,consumer,consumer goods
 
# crm-analytics:
  . selected features: 463
  . top 

In [54]:
corpus_df = corpus_df[corpus_df['intra_doc_depth']<=3]
grouped = corpus_df.groupby('label').agg(descriptor_text=('feature_descriptor', ', '.join))
grouped['descriptor_text'] = grouped['descriptor_text'].apply(lambda x: x[:1000000] if len(x)>1000000 else x)

In [25]:
grouped['descriptor_text'][0]



In [63]:
import yake

custom_kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.8, dedupFunc="seqm", windowsSize=1, top=35)
keywords = custom_kw_extractor.extract_keywords(grouped['descriptor_text'][4])

for kw in keywords:
    print(kw)

('Service Console', 4.304653539967347e-07)
('Web Services', 6.122775927730627e-07)
('Services API', 6.620870348083024e-07)
('Case Management', 8.648365118898064e-07)
('Phone Integration', 1.1896415809783799e-06)
('Customer Service', 1.2009332339718195e-06)
('Apps Integration', 1.2195241333854526e-06)
('Mobile App', 1.3658958047627665e-06)
('Console App', 1.4140734419657023e-06)
('App Builder', 1.5645294119315964e-06)
('Lightning App', 1.7942695942016772e-06)
('Google Apps', 1.9102447943493854e-06)
('Configuration Services', 2.1338040637538473e-06)
('App Service', 2.2775597948809355e-06)
('Service Contracts', 2.2857847607329965e-06)
('Advanced Case', 2.5269270780301574e-06)
('Full Sandbox', 2.6027725466229754e-06)
('Pro Sandbox', 2.614620414686855e-06)
('Developer Sandbox', 2.992771754737386e-06)
('Partial Sandbox', 3.268275518358567e-06)
('Salesforce Mobile', 3.3925717187285754e-06)
('Integration App', 3.6585724001563583e-06)
('Flow Automation', 3.840419416901747e-06)
('Salesforce Iden