In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

filepath = "GroupExam2024_data_and_documentation/elektronisk-rapportering-ers-2018-fangstmelding-dca-simple.csv"
#Leser filen
df = pd.read_csv(filepath, sep=';', decimal=',')

In [15]:
#Sile ut informasjon:

df['Bruttotonnasje Kombinert'] = df['Bruttotonnasje 1969'].fillna(df['Bruttotonnasje annen'])

# Dropper unødvendig informasjon. Tabellen inneholder masse duplikater. som egen kolonne for navn og for kode
# Forskjellige klassifiseringsmåter. Holder på FDIR sine.

df.drop(columns = ['Meldingsdato', 
                   'Meldingsklokkeslett', 
                   'Startdato', 
                   'Startklokkeslett', 
                   'Hovedområde start', 
                   'Lokasjon start (kode)', 
                   'Stoppdato', 
                   'Stoppklokkeslett', 
                   'Fangstår', 
                   'Hovedområde stopp (kode)', 
                   'Hovedområde stopp', 
                   'Lokasjon stopp (kode)', 
                   'Redskap FAO (kode)', 
                   'Redskap FAO', 
                   'Redskap FDIR', 
                   'Hovedart FAO (kode)', 
                   'Hovedart FAO', 
                   'Art FAO (kode)',
                   'Art FAO',
                   'Art - gruppe', 
                   'Lengdegruppe', 
                   'Bredde', 
                   'Fartøylengde',
                   'Bruttotonnasje 1969',
                   'Bruttotonnasje annen'
    ], inplace=True)

In [16]:
# Siden Hyse og torsk ofte fiskes samtidig:
filtered_df = df[(['Art - FDIR (kode)'] == 1022) | (df['Art - FDIR (kode)'] == 1027)]
filtered_df

Unnamed: 0,Melding ID,Meldingstidspunkt,Starttidspunkt,Startposisjon bredde,Startposisjon lengde,Hovedområde start (kode),Havdybde start,Stopptidspunkt,Varighet,Stopposisjon bredde,...,Havdybde stopp,Trekkavstand,Redskap FDIR (kode),Hovedart - FDIR (kode),Art - FDIR (kode),Art - FDIR,Art - gruppe (kode),Rundvekt,Lengdegruppe (kode),Bruttotonnasje Kombinert
1,1497178,01.01.2018,30.12.2017 23:21,74.885,16.048,20.0,-335,31.12.2017 04:16,295,74.914,...,-334,3970.0,51.0,1027.0,1027.0,Hyse,202.0,9594.0,5.0,1476.0
5,1497178,01.01.2018,31.12.2017 05:48,74.910,15.868,20.0,-403,31.12.2017 10:15,267,74.901,...,-277,11096.0,51.0,1027.0,1027.0,Hyse,202.0,9118.0,5.0,1476.0
11,1497178,01.01.2018,31.12.2017 11:34,74.883,16.056,20.0,-346,31.12.2017 16:49,315,74.924,...,-496,10215.0,51.0,1027.0,1027.0,Hyse,202.0,12432.0,5.0,1476.0
18,1497178,01.01.2018,31.12.2017 17:44,74.931,15.785,20.0,-443,31.12.2017 21:47,243,74.926,...,-358,3214.0,51.0,1022.0,1027.0,Hyse,202.0,6758.0,5.0,1476.0
19,1497229,01.01.2018 15:49,01.01.2018 10:01,67.828,12.972,5.0,-71,01.01.2018 11:04,63,67.827,...,-56,1269.0,61.0,1027.0,1027.0,Hyse,202.0,4.0,3.0,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305405,1800286,01.01.2019 07:09,31.12.2018 15:07,70.862,22.141,4.0,-191,31.12.2018 18:59,232,70.877,...,-178,2400.0,51.0,1032.0,1027.0,Hyse,202.0,258.0,5.0,691.0
305407,1800291,01.01.2019 09:28,30.12.2018 23:20,76.509,14.295,21.0,-193,31.12.2018 04:21,301,76.431,...,-341,13284.0,51.0,1027.0,1027.0,Hyse,202.0,7277.0,5.0,1483.0
305415,1800291,01.01.2019 09:28,31.12.2018 05:40,76.385,14.932,21.0,-244,31.12.2018 11:25,345,76.640,...,-299,44035.0,51.0,1027.0,1027.0,Hyse,202.0,6182.0,5.0,1483.0
305423,1800291,01.01.2019 09:28,31.12.2018 12:53,76.606,13.800,21.0,-280,31.12.2018 18:21,328,76.899,...,-289,41216.0,51.0,1022.0,1027.0,Hyse,202.0,4315.0,5.0,1483.0


In [17]:
# Samle data om tokter der meldingsID er lik og starttidspunkt er lik
aggregated_trips = df.groupby(['Melding ID', 'Starttidspunkt']).agg({
    'Rundvekt': 'sum',  # Sum the total catch weight for each trip
    'Art - FDIR': lambda x: ', '.join(x.astype(str).unique()),  # Join unique species names
    # Add other columns as needed, specifying how to aggregate each
}).reset_index()

# Renaming columns for clarity if necessary
aggregated_trips.rename(columns={'Rundvekt': 'TotalFangst', 'Art - FDIR': 'ArtsListe'}, inplace=True)

# Display the aggregated DataFrame
aggregated_trips

Unnamed: 0,Melding ID,Starttidspunkt,TotalFangst,ArtsListe
0,1497177,31.12.2017,706714.0,Antarktisk krill
1,1497178,30.12.2017 23:21,18434.0,"Hyse, Torsk, Blåkveite, Sei"
2,1497178,31.12.2017 05:48,16062.0,"Hyse, Torsk, Blåkveite, Flekksteinbit, Sei, Kv..."
3,1497178,31.12.2017 11:34,17878.0,"Hyse, Torsk, Blåkveite, Sei, Snabeluer, Uer (v..."
4,1497178,31.12.2017 17:44,13780.0,"Torsk, Hyse"
...,...,...,...,...
100074,1800286,31.12.2018 15:07,9023.0,"Sei, Torsk, Hyse, Kveite"
100075,1800291,30.12.2018 23:20,11105.0,"Hyse, Torsk, Blåkveite, Gråsteinbit, Flekkstei..."
100076,1800291,31.12.2018 05:40,10017.0,"Hyse, Torsk, Gråsteinbit, Flekksteinbit, Uer (..."
100077,1800291,31.12.2018 12:53,10427.0,"Torsk, Hyse, Uer (vanlig), Blåkveite, Kveite"


In [18]:
# Droppe nullverdier
df_cleaned = filtered_df.dropna(subset=['Bruttotonnasje Kombinert','Varighet', 'Havdybde start', 'Rundvekt'])

In [19]:
# Assuming `df` is your DataFrame
# Select features and target
X = df_cleaned[['Bruttotonnasje Kombinert','Varighet', 'Rundvekt']]  # Example features
y = df_cleaned['Havdybde start']  # Target variable


In [28]:


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN Regressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
print(f"KNN RMSE: {mean_squared_error(y_test, y_pred_knn, squared=False)}")
print(f'Score: {knn.score(X_test, y_pred_knn)}')

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print(f"Linear Regression RMSE: {mean_squared_error(y_test, y_pred_lr, squared=False)}")
print(f'Score: {lr.score(X_test, y_pred_knn)}')

KNN RMSE: 115.9967020099064
Score: -5.160856350548047
Linear Regression RMSE: 120.12048476745328
Score: 0.2943966356066382




X = df_cleaned[['Bruttotonnasje Kombinert','Varighet', 'Rundvekt']]  # Example features
y = df_cleaned['Havdybde start']  # Target variable

All fisk:
Resultat med 5 nermeste naboer:
KNN RMSE: 190.3080985365868
Linear Regression RMSE: 200.28353086847204

10 nermeste:
KNN RMSE: 185.96015957933645
Linear Regression RMSE: 200.28353086847204

15 nermeste:
KNN RMSE: 184.5590055924737
Linear Regression RMSE: 200.28353086847204


Filtrert for bare torsk og hyse k=15:
KNN RMSE: 113.03576808665306
Linear Regression RMSE: 120.12048476745328

KNN RMSE: 112.35307942799963
Linear Regression RMSE: 120.12048476745328