In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [21]:
import boto3
bucket_name = 'dataminds-homeworks'
s3_file_key = 'data_usage_production.parquet'            # e.g. 'folder/myfile.txt'
local_file_path = 'data_usage_production.parquet'        # Local destination

# Create an S3 client (remove `bucket_name` here — not a valid argument for boto3.client)
s3 = boto3.client(
    's3',
    region_name='us-east-1'
    # aws_access_key_id='your_access_key',
    # aws_secret_access_key='your_secret_key'
)

# Download the file
try:
    s3.download_file(bucket_name, s3_file_key, local_file_path)
    print(f"✅ File downloaded successfully from s3://{bucket_name}/{s3_file_key} to {local_file_path}")
except Exception as e:
    print("❌ Error downloading file:", e)

✅ File downloaded successfully from s3://dataminds-homeworks/data_usage_production.parquet to data_usage_production.parquet


In [22]:
df=pd.read_parquet('data_usage_production.parquet')

In [23]:
cat_first = df.select_dtypes(include='object').columns.tolist()
num_first = df.select_dtypes(include='number')
needed=num_first.corr()['data_compl_usg_local_m1'].abs().sort_values(ascending=False)
features_keeped=needed.drop("data_compl_usg_local_m1").head(8).index.tolist()
 
corr_features = df[features_keeped].corr()
trin = corr_features.where(~np.tril(np.ones(corr_features.shape), k=0).astype(bool))
high_pairs=[
    column for column in trin.columns
    if any(trin[column].abs()>0.8)
]
final_nums=[
    feature for feature in features_keeped
    if feature not in high_pairs
]
columns_keeped=cat_first+final_nums+["data_compl_usg_local_m1"]
data=df[columns_keeped].sample(n=10_000,random_state=42)

In [24]:
columns_keeped

['telephone_number',
 'tariff_desc',
 'customer_status',
 'lasttariff_m2',
 'lasttariff_m3',
 'lasttariff_m4',
 'lasttariff_m5',
 'lasttariff_m6',
 'data_compl_usg_local_m2',
 'data_compl_usg_local_m4',
 'dpi_tik_tok_m2',
 'data_pack_rev_local_m2',
 'data_compl_usg_local_m1']

In [25]:
X=data.drop(columns="data_compl_usg_local_m1")
y=data["data_compl_usg_local_m1"]


In [26]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
numeric_features=X.select_dtypes(include=["int64","float64"]).columns.tolist()
numerical_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

In [28]:
categorical_features=X.select_dtypes(include=["object"]).columns.tolist()
categorical_transformer=Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(f_regression, percentile=50))
])

In [29]:
preprocessor=ColumnTransformer(
    transformers=[
        ("num",numerical_transformer,numeric_features),
        ("cat",categorical_transformer,categorical_features)  
])

In [30]:
preprocessor

In [31]:
res_pipeline=Pipeline(
    steps=[("preprocessor",preprocessor),
           ("regressor",RandomForestRegressor(n_estimators=100,random_state=42))
])

In [32]:
res_pipeline

In [33]:
res_pipeline.fit(X_train,y_train)

In [34]:
y_predicted=res_pipeline.predict(X_test)

In [35]:
y_predicted

array([  133.996     ,    55.87925223, 26155.1787    , ...,
          26.29982354,  1180.7547    ,    55.87925223])

In [38]:
r2=r2_score(y_test,y_predicted)

In [39]:
r2

0.6492071750953843