In [1]:

import pandas as pd
import numpy as np
import math


In [104]:

#### Reading the training properties file ####
training_properties = pd.read_csv("Training_Properties.csv", low_memory=False)
training_properties['word'] = training_properties['word'].str.lower()
training_properties = training_properties.drop_duplicates(['word'],keep='first')

#### Reading the Clusters.csv ####
training_clusters = pd.read_csv("Training_Clusters.csv", low_memory=False)


#### Merging the clusters and training to include only the centroids as a part of training set ####
training_final = pd.merge(training_properties, training_clusters, how = 'left', left_on=['word','source'], right_on = ['word','source'])
training_final = training_final.dropna()
training_final.shape

(2144, 9)

In [105]:

#### Processing the data - adding features ####

#### Adding tf_idf ( term frequency - inverse document frequency ) to the training set ####
doc_count = 3332956
training_final = training_final.fillna(1)
training_final['volume_count'] = pd.to_numeric(training_final.volume_count, errors='coerce')
training_final['tf_idf'] =  training_final['frequency']/training_final['wordCount'] 
training_final['tf_idf'] =  training_final['tf_idf']*(training_final['volume_count'].apply(lambda x: math.log(doc_count/(2+x),10)))


#### Adding the pos_rank - rank for every parts of speech ####
training_final['pos_rank'] = 2
training_final.loc[training_final.pos == 'NN' , 'pos_rank'] = 14
training_final.loc[training_final.pos == 'JJ' , 'pos_rank' ] = 7
training_final.loc[training_final.pos == 'NNS' , 'pos_rank'] = 6
training_final.loc[training_final.pos == 'NNP' , 'pos_rank' ] = 6
training_final.loc[training_final.pos == 'VBP' , 'pos_rank' ] = 3


In [106]:

#### Scoring the training properties ####
training_final['score'] = 0
for index,row in training_final.iterrows():
    score = ((row['pos_rank'] * row['n_gram_score']))*row['tf_idf'] 
    training_final.loc[index , 'score'] = score
    
training_final = training_final.sort_values(['score'], ascending=False)
training_final_2 = training_final.copy()
training_final.shape

(2144, 12)

In [107]:
#### MODEL 1 : Predicting using the clustering and scoring function ####

res1_file1 = training_final.loc[training_final.source == 1]
res5_file5 = training_final.loc[training_final.source == 5]


In [108]:

#### Evaluating MODEL 1 ####
def match_count(predicted, actual):
    count = 0
    pred_copy = predicted
    act_copy = actual
    for str in predicted:
        if (actual.count(str) >= 1):
            count = count + 1
            act_copy.remove(str)
            pred_copy.remove(str)
    left_pred = pred_copy
    left_act = act_copy[:]
    match = False
    for index in act_copy:
            for str in pred_copy:
                if(index.find(str) != -1):
                    count = count + 1
                    match = True
                    left_pred.remove(str)
                    
            if match:
                left_act.remove(index)
    list = [count, len(pred_copy),len(act_copy)]
    return list


In [109]:
actual_indices_5 = Indexes_Actual.loc[Indexes_Actual.source == 5]
pred_file5 = match_count(list(res5_file5['word'][:444].values),list(actual_indices_5['word'].values) )
pred_file5

[133, 311, 439]

In [110]:
actual_indices_1 = Indexes_Actual.loc[Indexes_Actual.source == 1]
pred_file1 = match_count(list(res1_file1['word'][:22].values),list(actual_indices_1['word'].values) )
pred_file1

[7, 15, 22]

In [111]:
#### Finding/Matching the indices in the training set ####

def match_count1(predicted, actual):
    count = 0
    index_words = []
    pred_copy = predicted
    act_copy = actual
    for str in predicted:
        if (actual.count(str) >= 1):
            count = count + 1
            act_copy.remove(str)
            pred_copy.remove(str)
            index_words.append(str)
    left_pred = pred_copy
    left_act = act_copy[:]
    match = False
    for index in act_copy:
            for str in pred_copy:
                if(index.find(str) != -1):
                    count = count + 1
                    match = True
                    left_pred.remove(str)
                    index_words.append(str)
            if match:
                left_act.remove(index)
    list = [count, len(pred_copy),len(act_copy),index_words]
    return list


In [112]:

#### Reading the actual indices of the files used for training ####
Indexes_Actual = pd.read_csv("Training_Indexes.csv", low_memory=False)

res = match_count1(list(training_final['word'].values) , list(Indexes_Actual['word'].values))


def is_index():
    training_final['is_index'] = 0
    for item in res[3]:
        training_final.loc[training_final.word == item  , 'is_index'] = 1
        
is_index()


In [120]:

#### MODEL 2 : Logistic Regression ####

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
logistic = LogisticRegression(class_weight = "balanced")
training_features = ['score','tf_idf']
target = ['is_index']
train_x , test_x, train_y , test_y = train_test_split(training_final[training_features] , training_final[target] , train_size=0.8)
logistic.fit(train_x,train_y)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [121]:

#### Predicting for new files ####
accuracy = logistic.score(train_x , train_y)
accuracy

0.75568513119533531

In [122]:
accuracy = logistic.score(test_x , test_y)
accuracy

0.76223776223776218

In [123]:
#### Evaluating MODEL 2 ####

circular = training_final.loc[training_final_2.source == 1]

training_features = ['score','tf_idf']
test_x = circular[training_features]
pred_y = logistic.predict(test_x)
circular['is_index_pred'] = pred_y
inds = circular.loc[circular.is_index_pred == 1]
inds = inds.drop_duplicates(['word'],keep='first')
inds = inds.sort_values(['score'], ascending=False)
inds['word'][:23]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


48          unitary conjugation
43                matrix convex
10                free circular
63                linear pencil
139                 pencil ball
105             circular matrix
140           reducing subspace
33                  cross terms
252                  monic free
112           free spectrahedra
92                   direct sum
71         unitarily equivalent
141          separation theorem
47                 spectrahedra
111            free polynomials
1                        matrix
257    orthogonal decomposition
126               circular free
19                     subspace
189       relatively irrational
65            positive integers
166                 free matrix
366               free analysis
Name: word, dtype: object

In [124]:
actual_indices_1 = Indexes_Actual.loc[Indexes_Actual.source == 1]
res_1 = match_count(list(inds['word'][:22].values) , list(actual_indices_1['word'].values))
res_1

[7, 15, 22]

In [126]:
matters_of_time = training_final.loc[training_final_2.source == 5]

training_features = ['score','tf_idf']
test_x = matters_of_time[training_features]
pred_y = logistic.predict(test_x)
matters_of_time['is_index_pred'] = pred_y
inds = matters_of_time.loc[matters_of_time.is_index_pred == 1]
inds = inds.drop_duplicates(['word'],keep='first')
inds = inds.sort_values(['score'], ascending=False)
inds.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(99, 14)

In [119]:
actual_indices_5 = Indexes_Actual.loc[Indexes_Actual.source == 5]
res_5 = match_count(list(inds['word'][:110].values) , list(actual_indices_5['word'].values))
res_5

[55, 55, 442]

In [159]:
#### MODEL 3 : Random Forest Regression ####

## Using Random forests
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
regr = RandomForestRegressor(max_depth=10, random_state=0)

training_features = ['tf_idf','score','n_gram_score']
target = ['is_index']
train_x , test_x, train_y , test_y = train_test_split(training_final[training_features] , training_final[target] , train_size=0.8)
regr.fit(train_x,train_y)
accuracy = regr.score(train_x , train_y)
accuracy


  # This is added back by InteractiveShellApp.init_path()


0.52529474693068834

In [160]:
accuracy = regr.score(test_x , test_y)
accuracy

0.089995637571913423

In [161]:
#### Evaluating MODEL 3 ####
training_features = ['tf_idf','score','n_gram_score']
test_x = circular[training_features]
pred_y = regr.predict(test_x)
circular['is_index_pred'] = pred_y
inds = circular.loc[circular.is_index_pred == 1]
inds['word']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


48    unitary conjugation
43          matrix convex
10          free circular
Name: word, dtype: object

In [162]:
training_features = ['tf_idf','score','n_gram_score']
test_x = matters_of_time[training_features]
pred_y = regr.predict(test_x)
matters_of_time['is_index_pred'] = pred_y
inds = matters_of_time.loc[matters_of_time.is_index_pred == 1]
inds['word']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


7539          positive energy
7547    relational definition
7536          negative energy
7555        discrete symmetry
7542      gravitational field
7550            negative mass
7549         direction degree
7534            gravitational
Name: word, dtype: object

In [189]:
from sklearn import tree
clf1 = tree.DecisionTreeRegressor(max_depth=8)
training_features = ['score']
target = ['is_index']
train_x , test_x, train_y , test_y = train_test_split(training_final[training_features] , training_final[target] , train_size=0.8)
clf1.fit(train_x,train_y)
accuracy = clf1.score(train_x , train_y)
accuracy

0.21798718332160982

In [190]:
accuracy = clf1.score(test_x , test_y)
accuracy

-0.070663737585713982

In [188]:
#### Evaluating MODEL 3 ####
training_features = ['score']
test_x = circular[training_features]
pred_y = clf1.predict(test_x)
circular['is_index_pred'] = pred_y
inds = circular.loc[circular.is_index_pred == 1]
inds['word']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


10     free circular
246        isometric
69              sums
67           section
359        equations
148           define
Name: word, dtype: object