<a href="https://colab.research.google.com/github/syhennie/data-quality-process/blob/main/features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Настройка окружения

In [None]:
%pip install numpy gensim pandas scipy



In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Предобучение модели векторизации

In [None]:
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

# Training corpus, resolved through gensim's test-data path helper.
corpus_file = datapath('lee_background.cor')

model = FastText(vector_size=300)

# Build the vocabulary from the corpus file.
model.build_vocab(corpus_file=corpus_file)

# Train on the same corpus, reading the epoch count and corpus sizes back
# from the model itself.
model.train(
    corpus_file=corpus_file, epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words,
)

# Hand the path straight to save(): wrapping it in open(..., "w+") only served
# to read back the file name and kept a second, unused handle open on the very
# file gensim writes to.
model.save("fasttext_lee_background")

## Загрузка и предобработка данных

In [None]:
# Read the dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='test.csv')
df.head(n=5)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [None]:
# Keep the two text columns by name (the preview above shows the columns
# id, article, highlights). Selecting by name instead of position makes the
# cell safe to re-run: a positional slice would strip one more column on
# every execution.
# dropna() returns a new frame, so its result has to be assigned; calling it
# as a bare statement left the missing rows in place.
df = df[['article', 'highlights']].dropna()
df

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...
11485,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...
11486,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...
11487,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...
11488,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


## Извлечение свойств из набора данных

In [None]:
from gensim.models import Word2Vec, FastText
from gensim.utils import simple_preprocess

# Reload the FastText model from the file written by the pre-training cell
# above; this rebinds the module-level `model` used by get_vectorised_entries.
model = FastText.load("fasttext_lee_background")


def get_vectorised_entries(entries):
    """Embed each text entry as the mean of its token vectors.

    Every entry is tokenised with ``simple_preprocess`` and each token is
    looked up in the module-level ``model.wv``; the token vectors are averaged
    into a single vector per entry.

    Parameters
    ----------
    entries : iterable of str
        Texts to embed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(entries), model.wv.vector_size)``.

    Notes
    -----
    An entry that yields no tokens is mapped to the zero vector. Averaging an
    empty token list would otherwise produce a scalar NaN and make the stacked
    result ragged.
    """
    keyed_vectors = model.wv
    vector_dim = keyed_vectors.vector_size
    # Shared fallback row for entries without any usable token.
    empty_vector = np.zeros(vector_dim, dtype=np.float32)

    features = []
    for entry in entries:
        tokens = simple_preprocess(entry)
        if not tokens:
            features.append(empty_vector)
            continue
        vectors = [keyed_vectors[token] for token in tokens]
        features.append(np.mean(vectors, axis=0))

    if not features:
        # Keep the result two-dimensional even when there is nothing to embed.
        return np.empty((0, vector_dim), dtype=np.float32)
    return np.array(features)

In [None]:
# Two accumulators, one record per DataFrame column:
#   summary_stats - scalar descriptors of the column's embedding matrix
#   features_data - the embedding matrix itself
summary_stats = []
features_data = []

for column in df.columns:
    entries = df[column].tolist()
    features = get_vectorised_entries(entries)

    # Statistics taken along axis 0, i.e. one value per embedding dimension.
    mean = features.mean(axis=0)
    standard_deviation = features.std(axis=0)
    median = np.median(features, axis=0)
    asymmetry = stats.skew(features, axis=0)
    excess = stats.kurtosis(features, axis=0)

    # Collapse the per-dimension statistics into scalars for the summary table.
    summary_stats.append(dict(
        column=column,
        overall_mean=mean.mean(),
        overall_std=standard_deviation.mean(),
        std_of_means=mean.std(),
        mean_of_medians=np.mean(median),
        asymmetry_avg=np.mean(asymmetry),
        excess_avg=np.mean(excess),
        n_entries=len(entries),
        vector_dim=features.shape[1],
    ))

    features_data.append(dict(column=column, vectors=features))

summary_df = pd.DataFrame(summary_stats)
summary_df

Unnamed: 0,column,overall_mean,overall_std,std_of_means,mean_of_medians,asymmetry_avg,excess_avg,n_entries,vector_dim
0,article,-0.016539,0.00661,0.203823,-0.016595,0.034912,4.487738,11490,300
1,highlights,-0.015794,0.013166,0.194579,-0.015867,0.011107,0.363758,11490,300


## Генерация синтетического набора данных

In [None]:
def generate_synthetic_vectors(summary_df, method, random_state=None):
    """Draw a synthetic vector matrix for every row of ``summary_df``.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        One row per source column. Each row is read for the keys 'column',
        'n_entries', 'vector_dim', 'overall_mean', 'overall_std',
        'std_of_means', 'asymmetry_avg' and 'excess_avg'.
    method : str
        'normal' draws from a multivariate normal with a constant mean vector
        and an isotropic covariance. 'statistical_adjustment' draws
        independent normal values and passes them through
        ``adjust_statistics``.
    random_state : None, int or numpy.random.Generator, optional
        Seed (or generator) for reproducible draws. With the default ``None``
        the global NumPy random state is used, exactly as before this
        parameter existed.

    Returns
    -------
    list of dict
        One dict per row with the keys 'column', 'vectors' (array of shape
        ``(n_entries, vector_dim)``) and 'original_stats' (the summary row).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    supported_methods = ('normal', 'statistical_adjustment')
    if method not in supported_methods:
        # Without this check an unknown method left `synthetic_vectors`
        # unbound and failed later with an unrelated-looking error.
        raise ValueError(
            f"Unknown method {method!r}; expected one of {supported_methods}"
        )

    rng = None if random_state is None else np.random.default_rng(random_state)

    synthetic_data = []

    for _, row in summary_df.iterrows():
        # iterrows() yields a single-dtype Series, so integer counts can come
        # back as floats; sizes and shapes need real ints.
        n_samples = int(row['n_entries'])
        vector_dim = int(row['vector_dim'])

        if method == 'normal':
            mean_vector = np.full(vector_dim, row['overall_mean'])
            # NOTE(review): the variance comes from 'std_of_means' (spread of
            # the per-dimension means), not from 'overall_std', so the
            # per-dimension std of the sample will not match 'overall_std'.
            # Confirm this is intended.
            cov_matrix = np.eye(vector_dim) * (row['std_of_means'] ** 2)

            synthetic_vectors = stats.multivariate_normal.rvs(
                mean=mean_vector,
                cov=cov_matrix,
                size=n_samples,
                random_state=rng,
            )
            # rvs() squeezes singleton axes; restore the 2-D layout so a single
            # sample or a single dimension still yields (n_samples, vector_dim).
            synthetic_vectors = np.reshape(
                synthetic_vectors, (n_samples, vector_dim)
            )

        else:  # 'statistical_adjustment'
            draw_normal = np.random.normal if rng is None else rng.normal
            base_vectors = draw_normal(
                loc=row['overall_mean'],
                scale=row['overall_std'],
                size=(n_samples, vector_dim)
            )

            synthetic_vectors = adjust_statistics(
                base_vectors,
                target_mean=row['overall_mean'],
                target_std=row['overall_std'],
                target_skew=row['asymmetry_avg'],
                target_kurt=row['excess_avg']
            )

        synthetic_data.append({
            'column': row['column'],
            'vectors': synthetic_vectors,
            'original_stats': row
        })

    return synthetic_data


def adjust_statistics(vectors, target_mean, target_std, target_skew, target_kurt):
    """Shift and rescale ``vectors`` so each dimension has the target mean and std.

    Parameters
    ----------
    vectors : numpy.ndarray
        Sample matrix; statistics are taken along axis 0.
    target_mean : float
        Mean every dimension should have afterwards.
    target_std : float
        Standard deviation every dimension should have afterwards.
    target_skew, target_kurt : float
        Accepted for interface compatibility but not applied: this function
        only matches the first two moments, so skewness and kurtosis of the
        input are left as they are.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``vectors``.
    """
    centered = vectors - np.mean(vectors, axis=0)   # centring
    current_std = np.std(centered, axis=0)          # current scale
    # A constant dimension has zero std and all-zero centred values, so any
    # finite divisor gives the same result. Substituting 1.0 there avoids the
    # division by zero without the small bias that adding an epsilon to every
    # std introduced.
    safe_std = np.where(current_std > 0, current_std, 1.0)
    rescaled = centered * (target_std / safe_std)

    return rescaled + target_mean                   # shift

In [None]:
def validate_synthetic_data(synthetic_data):
    """Compare each synthetic matrix with the summary statistics it was built from.

    For every item the per-dimension mean, standard deviation, skewness and
    kurtosis of ``item['vectors']`` are averaged over the dimensions and set
    against the matching value in ``item['original_stats']``.

    Parameters
    ----------
    synthetic_data : list of dict
        Items with the keys 'vectors' (2-D array) and 'original_stats'
        (mapping with 'column', 'overall_mean', 'overall_std',
        'asymmetry_avg' and 'excess_avg').

    Returns
    -------
    pandas.DataFrame
        One row per item holding the original value, the synthetic value and
        their absolute difference for each statistic, plus the sample count
        and vector dimension.
    """
    # (output label, key in the original summary, per-dimension statistic)
    metrics = (
        ('mean', 'overall_mean', np.mean),
        ('std', 'overall_std', np.std),
        ('skew', 'asymmetry_avg', stats.skew),
        ('kurt', 'excess_avg', stats.kurtosis),
    )

    rows = []
    for item in synthetic_data:
        sample = item['vectors']
        reference = item['original_stats']

        record = {'column': reference['column']}
        for label, key, statistic in metrics:
            synthetic_value = np.mean(statistic(sample, axis=0))
            record[f'original_{label}'] = reference[key]
            record[f'synthetic_{label}'] = synthetic_value
            record[f'{label}_error'] = np.abs(synthetic_value - reference[key])

        record['n_samples'] = len(sample)
        record['vector_dim'] = sample.shape[1]
        rows.append(record)

    return pd.DataFrame(rows)

In [None]:
methods = ['normal', 'statistical_adjustment']
results = {}

# For each generation method: synthesise, validate against the summary
# statistics, keep both artefacts, and print the error columns.
for method in methods:
    print(method)
    synthetic_data = generate_synthetic_vectors(summary_df, method=method)
    validation_df = validate_synthetic_data(synthetic_data)
    results[method] = dict(data=synthetic_data, validation=validation_df)
    print(validation_df.loc[:, ['column', 'mean_error', 'std_error', 'skew_error', 'kurt_error']])

normal
       column  mean_error  std_error  skew_error  kurt_error
0     article    0.000084   0.197248    0.037644    4.492911
1  highlights    0.000094   0.181360    0.012604    0.366057
statistical_adjustment
       column  mean_error     std_error  skew_error  kurt_error
0     article         0.0  1.000502e-08    0.035226    4.487348
1  highlights         0.0  1.000050e-08    0.012739    0.366742


In [None]:
# Display the synthetic data left over from the comparison loop. The loop
# reassigns `synthetic_data` on every pass, so this shows only the last method
# in `methods` ('statistical_adjustment'); earlier runs live in `results`.
synthetic_data

[{'column': 'article',
  'vectors': array([[-0.02918556, -0.00241555, -0.01755176, ..., -0.01850015,
          -0.02370537, -0.02502471],
         [-0.02091651, -0.0143409 , -0.02476307, ..., -0.01849512,
          -0.01591163, -0.0140214 ],
         [-0.01054276, -0.01340535, -0.01422865, ..., -0.01673924,
          -0.00102861, -0.02292433],
         ...,
         [-0.0035998 , -0.01470775, -0.01273879, ..., -0.01729498,
          -0.02569834, -0.00997742],
         [-0.00747992, -0.01899989, -0.00887265, ..., -0.0187878 ,
          -0.00836075, -0.01032919],
         [-0.01510822, -0.02989641, -0.01921276, ..., -0.01451408,
          -0.02706147, -0.0189524 ]]),
  'original_stats': column              article
  overall_mean      -0.016539
  overall_std         0.00661
  std_of_means       0.203823
  mean_of_medians   -0.016595
  asymmetry_avg      0.034912
  excess_avg         4.487738
  n_entries             11490
  vector_dim              300
  Name: 0, dtype: object},
 {'column':

In [None]:
# Display the per-column embedding matrices collected while the summary
# statistics were computed, for side-by-side inspection with the synthetic data.
features_data

[{'column': 'article',
  'vectors': array([[-0.03910081,  0.07555775, -0.19910899, ...,  0.11369136,
           0.01881464, -0.10005966],
         [-0.03758977,  0.07306214, -0.19212992, ...,  0.1098166 ,
           0.01838122, -0.09661971],
         [-0.04078785,  0.07883573, -0.20763955, ...,  0.11863095,
           0.01963237, -0.10449798],
         ...,
         [-0.03664939,  0.07096931, -0.18677235, ...,  0.10683037,
           0.0177924 , -0.09381549],
         [-0.03699807,  0.0711152 , -0.18748133, ...,  0.10705117,
           0.01771052, -0.09423301],
         [-0.04138874,  0.07998636, -0.21065971, ...,  0.12025093,
           0.01991503, -0.10597844]], dtype=float32)},
 {'column': 'highlights',
  'vectors': array([[-0.03463296,  0.06700332, -0.17614107, ...,  0.10073883,
           0.0168469 , -0.08871191],
         [-0.03376409,  0.06543773, -0.17219642, ...,  0.09823359,
           0.01640617, -0.08687474],
         [-0.03850093,  0.0737346 , -0.19477962, ...,  0.11120691