In [60]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [61]:
# Read the raw CSV export into a DataFrame and preview the first rows.
csv_path = 'data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,PROJE,Unnamed: 1,REFERANS,Unnamed: 3,SAC\nKALINLIĞI\n(mm),Unnamed: 5,AKMA\nMUKAVEMETİ\n(Mpa),Unnamed: 7,ÇEKME\nMUKAVEMETİ\n(Mpa),Unnamed: 9,...,Unnamed: 19,KALIP AĞIRLIĞI\n(TON),Unnamed: 21,KALIP\nTİPİ,Unnamed: 23,AÇINIM\nKALIBI\nAĞIRLIĞI\n(ton),Unnamed: 25,MALİYET\n(EURO),Unnamed: 27,EURO/TON
0,,,,,,,,,,,...,,,,,,,,,,
1,TOGG BIW,,REF01,,0.7,,163.7,,282.0,,...,,237.08,,TANDEM,,52.0,,844580.0,,2923.03
2,,,,,,,,,,,...,,,,,,,,,,
3,TOGG BIW,,REF02,,0.7,,163.7,,282.0,,...,,237.08,,TANDEM,,0.0,,746595.0,,3149.13
4,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# Drop the filler columns: keep only columns whose name does not start with
# "Unnamed" (the negative lookahead rejects those headers).
data = data.filter(regex='^(?!Unnamed)', axis=1)

# Normalise every header in a single pass: lowercase it and turn each
# whitespace character into a hyphen. The embedded "\n" line breaks are
# whitespace too, so they need no separate replacement, and assigning the
# whole list at once avoids rebuilding the frame once per column.
data.columns = [re.sub(r'\s', '-', col.lower()) for col in data.columns]
data.head()


Unnamed: 0,proje,referans,sac-kalinliği-(mm),akma-mukavemeti̇-(mpa),çekme-mukavemeti̇-(mpa),bant-genisligi-(mm),hatve-(mm),operasyon--sayisi,parça-/-vuruş,kamli-deli̇k,kalip-ağirliği-(ton),kalip-ti̇pi̇,açinim-kalibi-ağirliği-(ton),mali̇yet-(euro),euro/ton
0,,,,,,,,,,,,,,,
1,TOGG BIW,REF01,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,52.0,844580.0,2923.03
2,,,,,,,,,,,,,,,
3,TOGG BIW,REF02,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,0.0,746595.0,3149.13
4,,,,,,,,,,,,,,,


In [63]:
# Index labels of the rows in which every column is missing.
all_missing = data.isna().all(axis=1)
empty_rows = data.loc[all_missing].index

# Remove those rows, then renumber what is left: [1,3,5,...] --> [0,1,2,...]
data = data.drop(index=empty_rows).reset_index(drop=True)
data.head()

Unnamed: 0,proje,referans,sac-kalinliği-(mm),akma-mukavemeti̇-(mpa),çekme-mukavemeti̇-(mpa),bant-genisligi-(mm),hatve-(mm),operasyon--sayisi,parça-/-vuruş,kamli-deli̇k,kalip-ağirliği-(ton),kalip-ti̇pi̇,açinim-kalibi-ağirliği-(ton),mali̇yet-(euro),euro/ton
0,TOGG BIW,REF01,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,52.0,844580.0,2923.03
1,TOGG BIW,REF02,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,0.0,746595.0,3149.13
2,TOGG BIW,REF05-06,0.65,150.9,286.7,1130.0,2520.0,4.0,2.0,0.0,110.31,TANDEM,0.0,442649.0,4012.77
3,TOGG BIW,REF07-08,0.65,150.9,286.7,1155.0,2470.0,4.0,2.0,0.0,115.59,TANDEM,0.0,430899.0,3727.82
4,TOGG BIW,REF56-57,0.7,158.2,288.0,1660.0,2805.0,4.0,2.0,0.0,174.39,TANDEM,28.46,740919.0,3652.55


In [64]:
# Replace the cleaned Turkish headers with English names.
# The assignment is positional: the i-th name below labels the i-th column, so
# this list must stay in the same order as the columns of `data`
# (the last column, 'euro/ton', becomes 'euro').
english_columns = [
    'project',
    'reference',
    'sheet-thickness',
    'yield-strength',
    'tensile-strength',
    'band-width',
    'pitch',
    'operation-number',
    'piece-beat',
    'cam-hole',
    'mold-weight',
    'mold-type',
    'opening-mold-weight',
    'cost',
    'euro',
]
data.columns = english_columns
data.head()

Unnamed: 0,project,reference,sheet-thickness,yield-strength,tensile-strength,band-width,pitch,operation-number,piece-beat,cam-hole,mold-weight,mold-type,opening-mold-weight,cost,euro
0,TOGG BIW,REF01,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,52.0,844580.0,2923.03
1,TOGG BIW,REF02,0.7,163.7,282.0,1790.0,3380.0,4.0,1.0,0.0,237.08,TANDEM,0.0,746595.0,3149.13
2,TOGG BIW,REF05-06,0.65,150.9,286.7,1130.0,2520.0,4.0,2.0,0.0,110.31,TANDEM,0.0,442649.0,4012.77
3,TOGG BIW,REF07-08,0.65,150.9,286.7,1155.0,2470.0,4.0,2.0,0.0,115.59,TANDEM,0.0,430899.0,3727.82
4,TOGG BIW,REF56-57,0.7,158.2,288.0,1660.0,2805.0,4.0,2.0,0.0,174.39,TANDEM,28.46,740919.0,3652.55


In [65]:
# Columns used as model inputs, and the column the models learn to predict.
target = 'cost'
features = [
    'sheet-thickness',
    'yield-strength',
    'tensile-strength',
    'mold-weight',
]

# X: feature frame (one column per entry in `features`), y: target series
X, y = data[features], data[target]

In [66]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
splits = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = splits


In [67]:
# Min-max normalisation so that all features share a comparable range.
# The scaler learns its ranges from the training split only and the same
# mapping is then applied to the test split, so no test information leaks
# into the fit.
scaler = MinMaxScaler().fit(X_train)

# NOTE(review): X_train / X_test are overwritten with their scaled versions.
# Re-running this cell without re-running the split cell would fit the scaler
# on already-scaled data.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [68]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [69]:
# Candidate regressors, keyed by the name later used for the score table and
# for the saved file names.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (same seed as the train/test split) to make the scores reproducible
# from run to run.
# NOTE: the 'xgboost' entry is scikit-learn's GradientBoostingRegressor, not
# the xgboost library; the key is kept because it becomes a file name.
models = {
    'linear': LinearRegression(),
    'decision_tree': DecisionTreeRegressor(random_state=42),
    'random_forest': RandomForestRegressor(random_state=42),
    'xgboost': GradientBoostingRegressor(random_state=42),
    'knn': KNeighborsRegressor(),
    'svm': SVR()
}

In [70]:
# Fit every candidate on the scaled training split and store the value its
# .score() method returns on the scaled test split (R^2 for scikit-learn
# regressors), keyed by model name.
scores = {}

for name, estimator in models.items():
    estimator.fit(X_train, y_train)
    scores[name] = estimator.score(X_test, y_test)

scores

{'linear': 0.976634538570231,
 'decision_tree': 0.9732782298557404,
 'random_forest': 0.9754165900966872,
 'xgboost': 0.9816062689945485,
 'knn': 0.8834759658063716,
 'svm': -0.14304364603441466}

In [71]:
import os

import joblib
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# save models
# The scores above were measured on min-max scaled features. To ship the same
# configuration that was evaluated, each saved artifact is a pipeline of a
# fresh MinMaxScaler followed by the estimator, refit on the full raw data.
# The pipeline scales internally, so it is still called with raw feature
# values at predict time.
os.makedirs("models", exist_ok=True)
for model in models:
    # clone() gives an unfitted copy with the same hyper-parameters, so the
    # estimators evaluated above are not overwritten by this refit.
    final_model = make_pipeline(MinMaxScaler(), clone(models[model]))
    final_model.fit(X, y)
    joblib.dump(final_model, f"models/{model}.pkl")

# save data
# The row index is written as the first (unnamed) column, as before.
os.makedirs("data", exist_ok=True)
data.to_csv('data/data.csv')