In [35]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pickle

In [6]:
df = pd.read_csv("turkish_movie_sentiment_dataset.csv")

In [8]:
df.isnull().sum()

comment      0
film_name    0
point        0
dtype: int64

In [10]:
df.shape

(83227, 3)

In [13]:
stop_words_list = stopwords.words("turkish")

In [14]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\t", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [word for word in words if not word in stop_words_list]
    words = [word.strip() for word in words if len(word.strip()) > 1]
    text = " ".join(words)
    return text

In [15]:
df["comment"] = df["comment"].apply(preprocess)

In [23]:
def int_point(point):
    return point[0]

In [24]:
df["point"] = df["point"].apply(int_point)

In [25]:
df["point"] = df["point"].astype("int")

In [29]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [31]:
def lemmatizing(text):
    stemmed_text = ""
    for word in text.split():
        stem = lemmatizer.lemmatize(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [32]:
df["comment"] = df["comment"].apply(lemmatizing)

In [33]:
def stemming(text):
    stemmed_text = ""
    for word in text.split():
        stem = stemmer.stem(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [34]:
df["comment"] = df["comment"].apply(stemming)

In [36]:
X = df["comment"]
y = df["point"]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [38]:
vectorizer = CountVectorizer(max_features=1000)
cv_train = vectorizer.fit_transform(X_train)
cv_test = vectorizer.transform(X_test)

In [39]:
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(cv_train.toarray())
X_test_pca = pca.transform(cv_test.toarray())

In [40]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train_pca, y_train)
y_pred = rf_model.predict(X_test_pca)
print("Random Forest: ", mean_absolute_error(y_pred, y_test))

RandomForest:  1.0405936625843208


In [41]:
linreg = LinearRegression()
linreg.fit(X_train_pca, y_train)
y_pred = linreg.predict(X_test_pca)
print("Linear Regression: ", mean_absolute_error(y_pred, y_test))

Linear Regression:  1.07319397878553


In [43]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train_pca, y_train)
y_pred = dt_model.predict(X_test_pca)
print("Decision Tree: ", mean_absolute_error(y_pred, y_test))

Decision Tree:  1.3954030207462091


In [44]:
knn_model = KNeighborsRegressor()
knn_model.fit(X_train_pca, y_train)
y_pred = knn_model.predict(X_test_pca)
print("KNN: ", mean_absolute_error(y_pred, y_test))

KNN:  1.1324882854739877


In [46]:
lasso = Lasso()
lasso.fit(X_train_pca, y_train)
y_pred = lasso.predict(X_test_pca)
print("Lasso: ", mean_absolute_error(y_pred, y_test))

Lasso:  1.1966809249440982


In [47]:
ridge = Ridge()
ridge.fit(X_train_pca, y_train)
y_pred = ridge.predict(X_test_pca)
print("Ridge: ", mean_absolute_error(y_pred, y_test))

Ridge:  1.0731973624501523


In [48]:
elastic_net = ElasticNet()
elastic_net.fit(X_train_pca, y_train)
y_pred = elastic_net.predict(X_test_pca)
print("Elastic Net: ", mean_absolute_error(y_pred, y_test))

Elastic Net:  1.1966809249440982
