In [17]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

In [2]:
df=pd.read_csv("kidsInMindSubtitles2004 (2).csv")

In [37]:
X_text = df['subtitles'].tolist()

y = np.array(df['Language'].tolist())

In [38]:
vectorizer = TfidfVectorizer()
X_numerical = vectorizer.fit_transform(X_text)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_numerical, y, test_size=0.1, random_state=42)

In [40]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [42]:
rf_regressor.fit(X_train, y_train)

In [43]:
y_pred = rf_regressor.predict(X_test)

In [44]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
explained_variance = explained_variance_score(y_test, y_pred)

In [45]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Explained Variance Score (R^2): {explained_variance}")

Mean Squared Error: 3.591072660978695
R-squared: 0.5164998381160262
Mean Absolute Error (MAE): 1.2744270907367925
Root Mean Squared Error (RMSE): 1.8950125754143943
Explained Variance Score (R^2): 0.5172561795663297


In [46]:
with open('models/random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_regressor, file)

In [8]:
import xgboost as xgb

In [9]:
xgb_regressor = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [10]:
xgb_regressor.fit(X_train, y_train)

In [13]:
y_pred = xgb_regressor.predict(X_test)

In [18]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
explained_variance = explained_variance_score(y_test, y_pred)

In [19]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Explained Variance Score (R^2): {explained_variance}")

Mean Squared Error: 3.745514668239984
R-squared: 0.49570584630295944
Mean Absolute Error (MAE): 1.2866645138062054
Root Mean Squared Error (RMSE): 1.9353332189160564
Explained Variance Score (R^2): 0.4960277717510768


In [22]:
with open('models/xgboost_model.pkl', 'wb') as file:
    pickle.dump(xgb_regressor, file)

In [23]:
from sklearn.svm import SVR

In [24]:
svr_regressor = SVR(kernel='linear')

In [25]:
svr_regressor.fit(X_train, y_train)

In [26]:
y_pred = svr_regressor.predict(X_test)

In [27]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
explained_variance = explained_variance_score(y_test, y_pred)

In [28]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Explained Variance Score (R^2): {explained_variance}")

Mean Squared Error: 4.703111370240178
R-squared: 0.36677552265128766
Mean Absolute Error (MAE): 1.622290145075247
Root Mean Squared Error (RMSE): 2.1686658041847244
Explained Variance Score (R^2): 0.368162899393658


In [29]:
with open('models/svm_model.pkl', 'wb') as file:
    pickle.dump(svr_regressor, file)

In [30]:
from sklearn.tree import DecisionTreeRegressor

In [31]:
decision_tree_regressor = DecisionTreeRegressor(max_depth=5, random_state=42)

In [32]:
decision_tree_regressor.fit(X_train, y_train)

In [33]:
y_pred = decision_tree_regressor.predict(X_test)

In [34]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
explained_variance = explained_variance_score(y_test, y_pred)

In [35]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Explained Variance Score (R^2): {explained_variance}")

Mean Squared Error: 5.560808407273763
R-squared: 0.25129563811446975
Mean Absolute Error (MAE): 1.5656174077766754
Root Mean Squared Error (RMSE): 2.3581366388048344
Explained Variance Score (R^2): 0.2512956691447472


In [36]:
with open('models/decision_tree_model.pkl', 'wb') as file:
    pickle.dump(decision_tree_regressor, file)
