In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer

# Load data

df = pd.read_csv("books_task.csv")

# Data Cleaning
# Convert publishedDate to datetime
df['publishedDate'] = pd.to_datetime(df['publishedDate'], errors='coerce')

# Handle missing values
df['description'].fillna('', inplace=True)

# Feature Engineering
# Extract year and month from publishedDate
df['publishedYear'] = df['publishedDate'].dt.year
df['publishedMonth'] = df['publishedDate'].dt.month

# Convert authors, categories, and title to string
def authors_clean(x):
    if pd.isna(x):
        return ""
    x = x.replace("[","")
    x = x.replace("]","")
    return ', '.join(x.split(","))

df["authors"] = df["authors"].apply(authors_clean)
#df['authors'] = df['authors'].apply(lambda x: ', '.join(x.replace("[","").replace("]","").split(",")))
#df['categories'] = df['categories'].apply(lambda x: ', '.join(x))
df['Title'] = df['Title'].astype(str)

# Feature Engineering for 'Title'
# Using TF-IDF to extract features from title
tfidf_title = TfidfVectorizer(stop_words='english', max_features=100)
title_features = tfidf_title.fit_transform(df['Title']).toarray()
title_feature_names = tfidf_title.get_feature_names_out()

# Convert to DataFrame and concatenate with original DataFrame
title_df = pd.DataFrame(title_features, columns=['title_' + name for name in title_feature_names])
df = pd.concat([df, title_df], axis=1)

# Feature Engineering for 'categories'
# Using TF-IDF to extract features from categories
# Feature Engineering for 'categories'
# Convert categories to one-hot encoded features
mlb = MultiLabelBinarizer()
categories_features = mlb.fit_transform(df['categories'].apply(lambda x: x.split(', ')))
categories_feature_names = mlb.classes_

# Convert to DataFrame and concatenate with original DataFrame
categories_df = pd.DataFrame(categories_features, columns=['category_' + name for name in categories_feature_names])
#df = df.drop(["categories"],axis=1)
df = pd.concat([df, categories_df], axis=1)


# Feature Engineering for 'description'
# Using TF-IDF to extract features from description
tfidf_description = TfidfVectorizer(stop_words='english', max_features=100)
description_features = tfidf_description.fit_transform(df['description']).toarray()
description_feature_names = tfidf_description.get_feature_names_out()

# Convert to DataFrame and concatenate with original DataFrame
description_df = pd.DataFrame(description_features, columns=['desc_' + name for name in description_feature_names])
df = pd.concat([df, description_df], axis=1)

# Feature Engineering for 'authors'
# Using TF-IDF to extract features from authors
tfidf_authors = TfidfVectorizer(stop_words='english', max_features=100)
authors_features = tfidf_authors.fit_transform(df['authors']).toarray()
authors_feature_names = tfidf_authors.get_feature_names_out()

# Convert to DataFrame and concatenate with original DataFrame
authors_df = pd.DataFrame(authors_features, columns=['author_' + name for name in authors_feature_names])
df = pd.concat([df, authors_df], axis=1)

# Feature Engineering for 'publisher'
# Using TF-IDF to extract features from publisher
tfidf_publisher = TfidfVectorizer(stop_words='english', max_features=100)
publisher_features = tfidf_publisher.fit_transform(df['publisher']).toarray()
publisher_feature_names = tfidf_publisher.get_feature_names_out()

# Convert to DataFrame and concatenate with original DataFrame
publisher_df = pd.DataFrame(publisher_features, columns=['publisher_' + name for name in publisher_feature_names])
df = pd.concat([df, publisher_df], axis=1)

# Regression
# Prepare features and target variable
X = df.drop(['Title', 'description', 'authors', 'publisher', 'publishedDate', 'categories', 'Impact'], axis=1)
y = df['Impact']

X = X.fillna(0)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna('', inplace=True)


Mean Squared Error: 3705.4575570253664


In [2]:
for x in df.columns:
    print(x)

Unnamed: 0
Title
description
authors
publisher
publishedDate
categories
Impact
publishedYear
publishedMonth
title_age
title_america
title_american
title_art
title_best
title_bible
title_black
title_book
title_books
title_business
title_cd
title_century
title_child
title_children
title_christian
title_city
title_classics
title_collection
title_complete
title_contemporary
title_culture
title_day
title_death
title_design
title_dictionary
title_edition
title_encyclopedia
title_english
title_family
title_god
title_good
title_great
title_guide
title_guides
title_handbook
title_health
title_heart
title_history
title_home
title_house
title_ii
title_introduction
title_journey
title_language
title_library
title_life
title_literature
title_little
title_living
title_love
title_making
title_man
title_management
title_manual
title_men
title_modern
title_music
title_mystery
title_new
title_novel
title_old
title_people
title_philosophy
title_poems
title_politics
title_power
title_practical
title_pract

In [3]:
X

Unnamed: 0.1,Unnamed: 0,publishedYear,publishedMonth,title_age,title_america,title_american,title_art,title_best,title_bible,title_black,...,publisher_uk,publisher_univ,publisher_university,publisher_usa,publisher_vintage,publisher_wiley,publisher_wm,publisher_york,publisher_young,publisher_zondervan
0,0,1996.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2,2,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.671742,0.0,0.0,0.000000,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
4,5,1996.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.640061,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138719,212397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
138720,212398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
138721,212399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
138722,212400,1995.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.640061,0.0,0.0,0.0


In [4]:
X_test

Unnamed: 0.1,Unnamed: 0,publishedYear,publishedMonth,title_age,title_america,title_american,title_art,title_best,title_bible,title_black,...,publisher_uk,publisher_univ,publisher_university,publisher_usa,publisher_vintage,publisher_wiley,publisher_wm,publisher_york,publisher_young,publisher_zondervan
90643,136496,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33022,46068,2012.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,5538,2005.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69527,106670,2004.0,1.0,0.0,0.638731,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64386,99530,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88145,132974,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1445,2054,1995.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95832,144435,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55294,82391,2005.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df[df["Unnamed: 0"] == 139484][["Impact"]]

Unnamed: 0,Impact
92608,682.004019
