# <font color='#42AAFF'>Содержание</font><a id='toc0_'></a>    
1. [<font color='#42AAFF'>Импорт предобработанных данных</font>](#toc1_)    
2. [<font color='#42AAFF'>Проверка признаков на мультиколлинеарность</font>](#toc2_)    
3. [<font color='#42AAFF'>Оптимизация FAISS</font>](#toc3_)    
3.1. [<font color='#42AAFF'>Обучение индексов при разном количестве кластеров</font>](#toc3_1_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=true
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import pandas as pd
import numpy as np
import faiss
import time
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1. <a id='toc1_'></a>[<font color='#42AAFF'>Импорт предобработанных данных</font>](#toc0_)

In [2]:
# Load the preprocessed feature and target tables; in every file the first
# CSV column is used as the DataFrame index.
_CSV_OPTS = dict(index_col=0)

features_base = pd.read_csv('features_base.csv', **_CSV_OPTS)
features_train = pd.read_csv('features_train.csv', **_CSV_OPTS)
features_test = pd.read_csv('features_test.csv', **_CSV_OPTS)
target_train = pd.read_csv('target_train.csv', **_CSV_OPTS)
target_test = pd.read_csv('target_test.csv', **_CSV_OPTS)

Колонки с признаками, имеющими явно нестандартные распределения:

In [3]:
# Labels (as strings) of the feature columns whose distributions are clearly
# non-standard.
cols1 = [str(col_no) for col_no in (6, 21, 25, 33, 44, 59, 65, 70)]

# 2. <a id='toc2_'></a>[<font color='#42AAFF'>Проверка признаков на мультиколлинеарность</font>](#toc0_)

Проверим исходную базу на коллинеарность признаков с помощью Variance Inflation Factor:

In [4]:
def variance_inflation_factors(x):
    """Compute the variance inflation factor (VIF) of every column of ``x``.

    Each column is regressed on all remaining columns with a
    ``LinearRegression``; with R² being the ``r2_score`` of that fit on the
    same data, the column's VIF is ``1 / (1 - R²)``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table. Every column is used once as the regression target
        and otherwise as a predictor.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``features`` (column label) and ``vif``, sorted by
        ``vif`` in descending order with a fresh ``0..n-1`` index. A column
        that is explained perfectly by the others (R² >= 1) gets
        ``vif = inf``.
    """
    model = LinearRegression()
    records = []
    for column in x.columns:
        predictors = x.drop(columns=column)
        response = x[column]
        # Refit the same estimator for each column; only R² is kept.
        r2 = r2_score(response, model.fit(predictors, response).predict(predictors))
        # Exact collinearity would divide by zero; report an infinite VIF.
        vif_value = float('inf') if r2 >= 1 else 1 / (1 - r2)
        records.append({'features': column, 'vif': vif_value})
    # Build the table once and sort it once, after all columns are processed
    # (the sort used to run on every loop iteration).
    return (
        pd.DataFrame(records, columns=['features', 'vif'])
        .sort_values(by='vif', ascending=False)
        .reset_index(drop=True)
    )

Если VIF > 5, то будем считать признак коллинеарным. Определим VIF для всех признаков:

In [5]:
# Compute the VIF table for the base feature set; as the last expression of
# the cell, the resulting DataFrame is rendered as the cell output.
variance_inflation_factors(features_base)

Unnamed: 0,features,vif
0,63,1.229038
1,35,1.147099
2,57,1.102206
3,10,1.093029
4,45,1.0893
...,...,...
67,6,1.019902
68,18,1.019497
69,12,1.017122
70,65,1.00188


Мультиколлинеарных признаков не обнаружено: максимальный VIF ≈ 1.23, что существенно ниже порога 5.

# 3. <a id='toc3_'></a>[<font color='#42AAFF'>Оптимизация FAISS</font>](#toc0_)

## 3.1. <a id='toc3_1_'></a>[<font color='#42AAFF'>Обучение индексов при разном количестве кластеров</font>](#toc0_)

In [None]:
# Number of base vectors and their dimensionality.
num, dim = features_base.shape[0], features_base.shape[1]
# Cluster counts for which an IVF index will be built.
n_clusters_list = [15, 30, 50, 100, 250, 500, 1000, 2000, 3000]
# Flat L2 index over `dim`-dimensional vectors.
quantiser = faiss.IndexFlatL2(dim)

Обучим индексы при разном количестве кластеров и сохраним результаты:

In [None]:
%%time
# Convert the base vectors to a C-contiguous float32 matrix once; the
# conversion does not depend on the cluster count, so it is hoisted out of the
# loop instead of being repeated for every train/add call.
base_vectors = np.ascontiguousarray(features_base.values, dtype='float32')

for n_clusters in n_clusters_list:
    # Each IVF index gets its own coarse quantiser. NOTE(review): per the FAISS
    # documentation an IVF index (re)trains the quantiser it was constructed
    # with, so sharing one quantiser object between indexes with different
    # cluster counts ties their state together.
    ivf_quantiser = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(ivf_quantiser, dim, n_clusters)
    index.train(base_vectors)
    index.add(base_vectors)
    # One file per cluster count: 'index_15', 'index_30', ...
    faiss.write_index(index, 'index_' + str(n_clusters))

Wall time: 40min 18s


# 4. <a id='toc4_'></a>[<font color='#42AAFF'>Влияние доли кластеров поиска</font>](#toc0_)

In [4]:
def faiss_accuracy(features_base,base_index,features_train,target_train,n_clusters_search,n_samples,index):
    """Measure how often the true match is among the neighbours returned by ``index``.

    Parameters
    ----------
    features_base
        Unused; kept so existing callers do not have to change.
    base_index : dict
        Maps a label returned by ``index.search`` to the identifier it is
        compared against the target.
    features_train : pandas.DataFrame
        Query vectors, one per row; converted to float32 before searching.
    target_train : pandas.DataFrame
        Expected identifiers; only the first column is used, row-aligned with
        ``features_train``.
    n_clusters_search : int
        Value assigned to ``index.nprobe`` before searching.
    n_samples : int
        Number of neighbours requested per query.
    index
        Object exposing ``nprobe`` and ``search(queries, k)`` that returns
        ``(distances, labels)``.

    Returns
    -------
    tuple
        ``(accuracy, t_search)``: the percentage of queries whose target is
        among the returned neighbours, and the duration of the ``search`` call
        in seconds. Accuracy is ``nan`` when there are no queries.
    """
    # Prepare the inputs before starting the clock so that only the search
    # itself is timed.
    queries = np.ascontiguousarray(features_train.values, dtype='float32')
    targets = target_train[target_train.columns[0]].values.tolist()
    index.nprobe = n_clusters_search

    start_time = time.perf_counter()
    _, labels = index.search(queries, n_samples)
    t_search = time.perf_counter() - start_time

    n_queries = len(labels)
    if n_queries == 0:
        # No queries: accuracy is undefined; avoid dividing by zero.
        return float('nan'), t_search

    hits = 0
    for target, neighbours in zip(targets, labels.tolist()):
        # FAISS pads the result with -1 when it finds fewer than `n_samples`
        # neighbours; such labels have no entry in `base_index`, so skip them.
        hits += int(target in (base_index[r] for r in neighbours if r >= 0))
    acc = 100 * hits / n_queries
    return acc, t_search

In [5]:
# Grid search over (number of clusters, share of clusters probed, number of
# neighbours requested). One result row is produced per combination.
result_columns = ['n_clusters','n_clusters_search','n_clusters_ratio','n_samples','time_search','accuracy']
results = pd.DataFrame(columns=result_columns)
n_clusters_list = [15, 100, 250, 500, 1000, 2000, 3000]
n_clusters_ratio = [0.1, 0.2, 0.5]
n_samples_list = [5, 200, 500]

# Row position -> index label of the base table. It is the same for every
# index, so it is built once instead of once per cluster count.
base_index = dict(enumerate(features_base.index.to_list()))

records = []
for n_clusters in n_clusters_list:
    index = faiss.read_index('index_'+ str(n_clusters))
    for ratio in n_clusters_ratio:
        # Round before taking the ceiling so that floating-point noise in the
        # product (e.g. 3.0000000000000004) cannot add an extra cluster.
        n_clusters_search = int(np.ceil(round(ratio * n_clusters, 9)))
        for n_samples in n_samples_list:
            acc, t_search = faiss_accuracy(
                features_base, base_index, features_train, target_train,
                n_clusters_search, n_samples, index)
            records.append({
                'n_clusters': n_clusters,
                'n_clusters_search': n_clusters_search,
                'n_clusters_ratio': ratio,
                'n_samples': n_samples,
                'time_search': t_search,
                'accuracy': acc,
            })
            # Rebuild the (small) table after every run so `results` stays
            # up to date even if this long-running cell is interrupted.
            results = pd.DataFrame(records, columns=result_columns)
            display(results.tail(1))

Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
0,15,2,0.1,5,425.775919,54.554


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
1,15,2,0.1,200,429.399675,65.486


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
2,15,2,0.1,500,424.060921,67.148


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
3,15,3,0.2,5,630.815978,58.191


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
4,15,3,0.2,200,631.957546,70.751


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
5,15,3,0.2,500,633.348987,72.674


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
6,15,8,0.5,5,1676.106068,62.962


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
7,15,8,0.5,200,1679.16032,78.631


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
8,15,8,0.5,500,1676.599332,81.137


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
9,100,10,0.1,5,332.530909,62.736


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
10,100,10,0.1,200,332.916333,77.699


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
11,100,10,0.1,500,333.572543,80.007


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
12,100,20,0.2,5,661.881267,63.688


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
13,100,20,0.2,200,662.131711,79.495


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
14,100,20,0.2,500,663.006946,81.955


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
15,100,50,0.5,5,1599.131043,64.062


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
16,100,50,0.5,200,1590.229341,80.533


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
17,100,50,0.5,500,1599.798427,83.181


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
18,250,25,0.1,5,298.879975,63.45


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
19,250,25,0.1,200,299.051863,78.963


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
20,250,25,0.1,500,299.629883,81.359


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
21,250,50,0.2,5,590.074653,63.889


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
22,250,50,0.2,200,605.939231,79.886


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
23,250,50,0.2,500,609.256034,82.381


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
24,250,125,0.5,5,1490.793612,64.066


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
25,250,125,0.5,200,1491.762763,80.537


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
26,250,125,0.5,500,1499.61078,83.192


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
27,500,50,0.1,5,284.14994,63.734


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
28,500,50,0.1,200,284.634202,79.409


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
29,500,50,0.1,500,284.899814,81.817


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
30,500,100,0.2,5,564.972622,63.964


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
31,500,100,0.2,200,576.57486,80.034


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
32,500,100,0.2,500,562.114128,82.539


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
33,500,250,0.5,5,1434.929402,64.076


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
34,500,250,0.5,200,1432.647104,80.574


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
35,500,250,0.5,500,1429.916935,83.232


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
36,1000,100,0.1,5,281.826622,63.797


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
37,1000,100,0.1,200,282.342471,79.6


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
38,1000,100,0.1,500,282.702191,82.042


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
39,1000,200,0.2,5,562.421916,63.99


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
40,1000,200,0.2,200,564.778695,80.131


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
41,1000,200,0.2,500,567.041052,82.669


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
42,1000,500,0.5,5,1422.851629,64.083


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
43,1000,500,0.5,200,1426.819971,80.609


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
44,1000,500,0.5,500,1423.191818,83.275


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
45,2000,200,0.1,5,280.341469,63.869


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
46,2000,200,0.1,200,274.319944,79.789


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
47,2000,200,0.1,500,274.320039,82.231


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
48,2000,400,0.2,5,547.640897,64.0


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
49,2000,400,0.2,200,547.907029,80.196


Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
50,2000,400,0.2,500,548.360596,82.735


KeyboardInterrupt: 

In [6]:
# Write the collected results as CSV to a file named 'results' (no extension)
# in the current working directory.
results.to_csv('results')

In [7]:
# Preview the first five result rows (five is the default for `head`).
results.head()

Unnamed: 0,n_clusters,n_clusters_search,n_clusters_ratio,n_samples,time_search,accuracy
0,15,2,0.1,5,425.775919,54.554
1,15,2,0.1,200,429.399675,65.486
2,15,2,0.1,500,424.060921,67.148
3,15,3,0.2,5,630.815978,58.191
4,15,3,0.2,200,631.957546,70.751
