In [1]:

import pandas as pd
import numpy as np
import math


In [63]:

#### Reading the training properties file ####
training_properties = pd.read_csv("Training_Properties.csv", low_memory=False)


#### Reading the Clusters.csv ####
training_clusters = pd.read_csv("Training_Clusters.csv", low_memory=False)

training_final['word'] = training_final['word'].str.lower()
training_final = training_final.drop_duplicates(['word'],keep='first')

#### Merging the clusters and training to include only the centroids as a part of training set ####
training_final = pd.merge(training_properties, training_clusters, how = 'left', left_on=['word','source'], right_on = ['word','source'])
training_final = training_final.dropna()
training_final.shape

(5412, 9)

In [64]:

#### Processing the data - adding features ####

#### Adding tf_idf ( term frequency - inverse document frequency ) to the training set ####
doc_count = 3332956
training_final = training_final.fillna(1)
training_final['volume_count'] = pd.to_numeric(training_final.volume_count, errors='coerce')
training_final['tf_idf'] =  training_final['frequency']/training_final['wordCount'] 
training_final['tf_idf'] =  training_final['tf_idf']*(training_final['volume_count'].apply(lambda x: math.log(doc_count/(2+x),10)))


#### Adding the pos_rank - rank for every parts of speech ####
training_final['pos_rank'] = 2
training_final.loc[training_final.pos == 'NN' , 'pos_rank'] = 14
training_final.loc[training_final.pos == 'JJ' , 'pos_rank' ] = 7
training_final.loc[training_final.pos == 'NNS' , 'pos_rank'] = 6
training_final.loc[training_final.pos == 'NNP' , 'pos_rank' ] = 6
training_final.loc[training_final.pos == 'VBP' , 'pos_rank' ] = 3


In [65]:

#### Scoring the training properties ####
training_final['score'] = 0
for index,row in training_final.iterrows():
    score = ((row['pos_rank'] * row['n_gram_score']))*row['tf_idf'] 
    training_final.loc[index , 'score'] = score
    
training_final = training_final.sort_values(['score'], ascending=False)
training_final_2 = training_final.copy()
training_final.shape

(5412, 12)

In [81]:
#### MODEL 1 : Predicting using the clustering and scoring function ####


In [None]:

#### Evaluating MODEL 1 ####
def match_count(predicted, actual):
    count = 0
    pred_copy = predicted
    act_copy = actual
    for str in predicted:
        if (actual.count(str) >= 1):
            count = count + 1
            act_copy.remove(str)
            pred_copy.remove(str)
    left_pred = pred_copy
    left_act = act_copy[:]
    match = False
    for index in act_copy:
            for str in pred_copy:
                if(index.find(str) != -1):
                    count = count + 1
                    match = True
                    left_pred.remove(str)
                    
            if match:
                left_act.remove(index)
    list = [count, len(pred_copy),len(act_copy)]
    return list


In [66]:
#### Finding/Matching the indices in the training set ####

def match_count1(predicted, actual):
    count = 0
    index_words = []
    pred_copy = predicted
    act_copy = actual
    for str in predicted:
        if (actual.count(str) >= 1):
            count = count + 1
            act_copy.remove(str)
            pred_copy.remove(str)
            index_words.append(str)
    left_pred = pred_copy
    left_act = act_copy[:]
    match = False
    for index in act_copy:
            for str in pred_copy:
                if(index.find(str) != -1):
                    count = count + 1
                    match = True
                    left_pred.remove(str)
                    index_words.append(str)
            if match:
                left_act.remove(index)
    list = [count, len(pred_copy),len(act_copy),index_words]
    return list


In [76]:

#### Reading the actual indices of the files used for training ####
Indexes_Actual = pd.read_csv("Training_Indexes.csv", low_memory=False)

res = match_count1(list(training_final['word'].values) , list(Indexes_Actual['word'].values))


def is_index():
    training_final['is_index'] = 0
    for item in res[3]:
        training_final.loc[training_final.word == item  , 'is_index'] = 1
        
is_index()


In [68]:

#### MODEL 2 : Logistic Regression ####

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
logistic = LogisticRegression(class_weight = "balanced")
training_features = ['score']
target = ['is_index']
train_x , test_x, train_y , test_y = train_test_split(training_final[training_features] , training_final[target] , train_size=0.8)
logistic.fit(train_x,train_y)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [69]:

#### Predicting for new files ####
accuracy = logistic.score(train_x , train_y)
accuracy

0.64495264495264493

In [70]:
accuracy = logistic.score(test_x , test_y)
accuracy

0.63619575253924288

In [79]:
#### Evaluating MODEL 2 ####

light_nuclei = training_final.loc[training_final_2.source == 1]

training_features = ['score']
test_x = light_nuclei[training_features]
pred_y = logistic.predict(test_x)
light_nuclei['is_index_pred'] = pred_y
inds = light_nuclei.loc[light_nuclei.is_index_pred == 1]
inds = inds.drop_duplicates(['word'],keep='first')
inds = inds.sort_values(['score'], ascending=False)
inds['word'][:23]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


51          unitary conjugation
46                matrix convex
10                free circular
70                linear pencil
163                 pencil ball
120             circular matrix
165           reducing subspace
35                  cross terms
303                  monic free
130           free spectrahedra
102                  direct sum
78         unitarily equivalent
166          separation theorem
50                 spectrahedra
129            free polynomials
189               circular free
309             operator pencil
282                 free matrix
1                        matrix
310    orthogonal decomposition
20                     subspace
225       relatively irrational
72            positive integers
Name: word, dtype: object

In [None]:
#### MODEL 3 : Random Forest Regression/classifier ####

## Using Random forests
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
regr = RandomForestRegressor(max_depth=10, random_state=0)

training_features = ['tf_idf','score','n_gram_score']
target = ['is_index']
train_x , test_x, train_y , test_y = train_test_split(merged_circular[training_features] , merged_circular[target] , train_size=0.8)
regr.fit(train_x,train_y)



In [None]:
#### Evaluating MODEL 3 ####

