In [1]:
import pandas as pd

In [3]:
file_path = 'computer_science_books_dataset.csv'
books_df = pd.read_csv(file_path, encoding='unicode_escape', low_memory=False, dtype={'column_name': str})

In [4]:
columns_to_drop = [
    'author_alternative_name', 'author_key', 'contributor', 'cover_edition_key', 'cover_i', 
    'ddc', 'ebook_access', 'ebook_count_i', 'edition_count', 'first_publish_year', 'key',
    'language', 'last_modified_i', 'place_key', 'place_facet', 'time_key', 'lcc_sort', 
    'author_facet', 'subject_facet', '_version_', 'subject_key', 'ddc_sort'
]
books_df_cleaned = books_df.drop(columns=columns_to_drop)

In [5]:
books_df_cleaned = books_df_cleaned.dropna(subset=['title', 'author_name'])

In [6]:
books_df_cleaned['ratings_average'] = pd.to_numeric(books_df_cleaned['ratings_average'], errors='coerce')

In [7]:
mean_rating = books_df_cleaned['ratings_average'].mean()
books_df_cleaned['ratings_average'].fillna(mean_rating, inplace=True)

In [8]:
columns_to_keep = ['title', 'author_name', 'ratings_average', 'ratings_count']
books_df_final = books_df_cleaned[columns_to_keep]

In [9]:
print(books_df_final.head())

                                title           author_name  ratings_average  \
0                                2001  ['Arthur C. Clarke']         4.163793   
1                            I, Robot      ['Isaac Asimov']         4.233766   
2                                Prey  ['Michael Crichton']         3.605263   
3                         Neuromancer    ['William Gibson']         3.970149   
4  Artemis Fowl and the Eternity Code       ['Eoin Colfer']         4.350000   

   ratings_count  
0          116.0  
1          154.0  
2           38.0  
3           67.0  
4           20.0  


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [11]:
label_encoder_title = LabelEncoder()
label_encoder_author = LabelEncoder()

In [12]:
books_df_final['title_encoded'] = label_encoder_title.fit_transform(books_df_final['title'])
books_df_final['author_encoded'] = label_encoder_author.fit_transform(books_df_final['author_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_final['title_encoded'] = label_encoder_title.fit_transform(books_df_final['title'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_final['author_encoded'] = label_encoder_author.fit_transform(books_df_final['author_name'])


In [13]:
X = books_df_final[['title_encoded', 'author_encoded', 'ratings_count']]
y = books_df_final['ratings_average']

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.impute import SimpleImputer
# Impute missing values with the mean (you can also use 'median', 'most_frequent', etc.)
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [26]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_imputed, y_train)

In [20]:
y_pred = knn.predict(X_test_imputed)

In [21]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.0805343117836928


In [24]:
 #Drop rows with missing values
X_train_dropna = X_train[~pd.isnull(X_train).any(axis=1)]
y_train_dropna = y_train[~pd.isnull(X_train).any(axis=1)]

X_test_dropna = X_test[~pd.isnull(X_test).any(axis=1)]
y_test_dropna = y_test[~pd.isnull(X_test).any(axis=1)]

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_dropna, y_train_dropna)

y_pred = knn.predict(X_test_dropna)
mse = mean_squared_error(y_test_dropna, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 1.1585654515014496
