In [1]:
import os
os.chdir("../")

In [2]:
import pandas as pd
import json
import sys
sys.path.append('./src')
from data import load_data

def generate_monthly_data(df,groupby_col, date_col='date'):
    result_dfs = []
    for well, data in df.groupby(groupby_col):
        data = data.set_index(date_col)
        idx = pd.date_range(data.index.min(), data.index.max(), freq='MS')
        data = data.reindex(idx)
        data = data.reset_index()
        data = data.rename(columns={'index':date_col})
        data[groupby_col] = well
        result_dfs.append(data)
    result_df = pd.concat(result_dfs)
    return result_df

config_file = "./config.json"
with open(config_file, 'r') as f:
    config = json.load(f)
train_data_path = config["train_data_path"]
df = load_data(train_data_path) 
X = df.drop(columns='cluster')
y = df.cluster
df = (X.explode(['date','value'])
      .assign(date = lambda df: pd.to_datetime(df['date']))
      .pipe(generate_monthly_data,'well', 'date')
      )
df

Unnamed: 0,date,well,value
0,1977-03-01,well_1,13.876923
1,1977-04-01,well_1,36.375
2,1977-05-01,well_1,53.058824
3,1977-06-01,well_1,18.695652
4,1977-07-01,well_1,9.804598
...,...,...,...
331,2004-10-01,well_99,2.508772
332,2004-11-01,well_99,2.264151
333,2004-12-01,well_99,2.3
334,2005-01-01,well_99,2.346774


In [2]:
import pandas as pd
import json
import sys
sys.path.append('./src')
from data import load_data

config_file = "./config.json"
with open(config_file, 'r') as f:
    config = json.load(f)

train_data_path = config["train_data_path"]

df = load_data(train_data_path) 
X = df.drop(columns='cluster')
y = df.cluster

df = (X.explode(['date','value'])
      .assign(date = lambda df: pd.to_datetime(df['date']))
      )

def pad_groups_with_zeros(df, group_col,value_col):
    well_counts = df[group_col].value_counts()
    max_samples = well_counts.max()
    grouped = df.groupby(group_col)
    padded_dfs = []
    for name, group in grouped:
        num_samples = len(group)
        num_zeros_to_pad = max_samples - num_samples
        zeros_to_pad = pd.DataFrame({group_col: [name] * num_zeros_to_pad, value_col: [0.0] * num_zeros_to_pad})
        padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
        padded_dfs.append(padded_group)

    padded_df = pd.concat(padded_dfs, ignore_index=True)
    return padded_df

df_padded = pad_groups_with_zeros(df, 'well','value')
df_padded

Unnamed: 0,well,date,value
0,well_1,1977-03-01,13.876923
1,well_1,1977-04-01,36.375
2,well_1,1977-05-01,53.058824
3,well_1,1977-06-01,18.695652
4,well_1,1977-07-01,9.804598
...,...,...,...
182775,well_99,NaT,0.0
182776,well_99,NaT,0.0
182777,well_99,NaT,0.0
182778,well_99,NaT,0.0


In [3]:
import pandas as pd
import json
import sys
sys.path.append('./src')
from data import load_data
def round_to_nearest_multiple(num, multiple):
    return round(num / multiple) * multiple

def calculate_relative_time(df, groupby_col, date_col='date', rounded = True):
    min_dates = df.groupby(groupby_col)[date_col].min()
    df['relative_time'] = df.apply(lambda row: (row[date_col] - min_dates[row[groupby_col]]).days, axis=1)
    if rounded:
        df['relative_time']  =  round_to_nearest_multiple(df['relative_time'], 30).astype(int)
    return df

config_file = "./config.json"
with open(config_file, 'r') as f:
    config = json.load(f)
train_data_path = config["train_data_path"]
df = load_data(train_data_path) 
X = df.drop(columns='cluster')
y = df.cluster
df = (X.explode(['date','value'])
      .assign(date = lambda df: pd.to_datetime(df['date']))
      .pipe(calculate_relative_time, 'well', 'date')
      )
df

Unnamed: 0,well,date,value,relative_time
0,well_9,1977-01-01,0.939394,0
0,well_9,1977-02-01,0.965517,30
0,well_9,1977-03-01,0.0,60
0,well_9,1977-04-01,0.0,90
0,well_9,1977-05-01,0.0,120
...,...,...,...,...
369,well_575,2017-10-01,12.052015,3420
369,well_575,2017-11-01,14.237062,3450
369,well_575,2017-12-01,17.529553,3480
369,well_575,2018-01-01,17.726489,3510


In [5]:
def data_formating(X):
    exploded = X.explode(['date', 'value'])
    exploded['date'] = pd.to_datetime(exploded['date'])
    return exploded
X.pipe(data_formating)

Unnamed: 0,well,date,value
0,well_9,1977-01-01,0.939394
0,well_9,1977-02-01,0.965517
0,well_9,1977-03-01,0.0
0,well_9,1977-04-01,0.0
0,well_9,1977-05-01,0.0
...,...,...,...
369,well_575,2017-10-01,12.052015
369,well_575,2017-11-01,14.237062
369,well_575,2017-12-01,17.529553
369,well_575,2018-01-01,17.726489


In [1]:
import os
os.chdir("../")
import pandas as pd
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import sys
sys.path.append('./src')
from data import load_data
from transformations import data_formating, generate_monthly_data, pad_groups_with_zeros
from sklearn.impute import SimpleImputer
class DataFormattingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        index = X.well.drop_duplicates().to_list()
        df = data_formating(X)
        df['well'] = pd.Categorical(df['well'], categories=index, ordered=True)
        df= df.sort_values(by='well')
        return df

class GenerateMonthlyDataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, groupby_col, date_col='date'):
        self.groupby_col = groupby_col
        self.date_col = date_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        index = X.well.drop_duplicates().to_list()
        df = generate_monthly_data(X, self.groupby_col, self.date_col)
        df['well'] = pd.Categorical(df['well'], categories=index, ordered=True)
        df= df.sort_values(by='well')
        return df

class PadGroupsWithZerosTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, group_col, value_col):
        self.group_col = group_col
        self.value_col = value_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        index = X.well.drop_duplicates().to_list()
        df = pad_groups_with_zeros(X, self.group_col, self.value_col)
        df['well'] = pd.Categorical(df['well'], categories=index, ordered=True)
        df= df.sort_values(by='well')
        df = df.drop(columns=['date','well'])
        return df


config_file = "./config.json"
with open(config_file, 'r') as f:
    config = json.load(f)

train_data_path = config["train_data_path"]

df = load_data(train_data_path) 

X = df.drop(columns='cluster')
y = df.cluster

clf = Pipeline([
    ('data_formatting', DataFormattingTransformer()),
    ('monthly_data_generation', GenerateMonthlyDataTransformer(groupby_col='well')),
    ('pad_with_zeros', PadGroupsWithZerosTransformer(group_col='well', value_col='value')),
    ('preprocessor', StandardScaler()),
    ('imputer',SimpleImputer(strategy='mean')),
    ("kmeans", KMeans())
])
param_grid = [{'kmeans__n_clusters': [4]
               }]
    
grid_search = GridSearchCV(clf,
                            param_grid,
                            cv=5,
                            scoring=['precision_weighted','recall_weighted','f1_weighted', 'balanced_accuracy'],
                            refit='f1_weighted',
                            n_jobs=-1,
                            return_train_score=True,
                            error_score='raise')
    
grid_search.fit(X, y)


  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group, zeros_to_pad], ignore_index=True)
  padded_group = pd.concat([group,

ValueError: Found input variables with inconsistent numbers of samples: [74, 36334]

In [4]:

results = pd.DataFrame(grid_search.cv_results_)
print(grid_search.best_params_)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
train_score = results['mean_train_f1_weighted'].iloc[0]
test_score = results['mean_test_f1_weighted'].iloc[0]
print(f"Best parameters for: {best_model}")
print(f"Best score for: {best_score}")
print(f"train score: {train_score}, test_score: {test_score}")

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'