In [1]:
! pip install pandas sklearn nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn
  Downloading sklearn-0.0.post7.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.4/770.4 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post7-py3-none-any.whl size=2360 sha256=536525d5eaacf6c19f1aebc7ac1fe617173ee41b7f0b24f54ada1ed4390102be
  Stored in directory: /home/thechemist54/.cache/pip/wheels/c8/9c/85/72901eb5

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK resources the preprocessing cells rely on.
nltk.download('punkt')  # tokenizer models (unzipped to tokenizers/punkt)
nltk.download('stopwords')  # corpus read later via stopwords.words('english')
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer — confirm

[nltk_data] Downloading package punkt to
[nltk_data]     /home/thechemist54/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thechemist54/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/thechemist54/nltk_data...


True

In [11]:
# Raw book-review records, one list per review in the order:
# title, author, reviewer, review date, free-text review, star rating
data = [
    [
        "To Kill a Mockingbird", "Harper Lee", "JohnDoe", "2023-06-23",
        "An unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it",
        5,
    ],
    [
        "1984", "George Orwell", "JaneDoe", "2023-06-24",
        "A frighteningly prophetic novel about the power of government surveillance and public manipulation",
        4,
    ],
    [
        "Moby Dick", "Herman Melville", "SamDoe", "2023-06-25",
        "A seemingly endless, convoluted whaling voyage filled with intricate details about the sea... Not my cup of tea",
        2,
    ],
    [
        "Pride and Prejudice", "Jane Austen", "AnnDoe", "2023-06-26",
        "A classic novel that blends romance with wit, and social criticism. Loved it",
        5,
    ],
    [
        "The Great Gatsby", "F. Scott Fitzgerald", "RickDoe", "2023-06-27",
        "This novel provides an incredibly discerning glimpse into the American soul, but the characters felt somewhat hollow",
        3,
    ],
    [
        "War and Peace", "Leo Tolstoy", "EveDoe", "2023-06-28",
        "A grand narrative that combines history, philosophy, and keen insight into the human heart. A challenging yet rewarding read",
        4,
    ],
    [
        "Crime and Punishment", "Fyodor Dostoevsky", "TomDoe", "2023-06-29",
        "A brilliant exploration of the human psyche when faced with profound moral dilemmas. Dark but profound",
        4,
    ],
    [
        "Wuthering Heights", "Emily Bronte", "ZaraDoe", "2023-06-30",
        "A dramatic narrative filled with raw emotions. The gothic atmosphere was spellbinding, though the story was slightly melodramatic",
        3,
    ],
]


In [12]:
# Load the raw review records into a DataFrame, one named column per field
df = pd.DataFrame(
    data=data,
    columns=["Title", "Author", "Reviewer", "Date", "Review", "Rating"],
)


In [13]:
# Shared text-normalisation resources used by the preprocessing step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [14]:
def preprocess_text(text):
    """Normalise one review string for vectorisation.

    Lowercases ``text``, tokenizes it with ``nltk.word_tokenize``, discards
    stop words and tokens that are not purely alphabetic, lemmatizes the
    remaining tokens and returns them joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Drop stop words and any token that is not purely alphabetic
        if token in stop_words or not token.isalpha():
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

In [15]:
# Store the normalised form of every review next to the original text
df["Cleaned_Review"] = df["Review"].map(preprocess_text)

In [16]:
# Feature extraction: the ratings are the target, and the TF-IDF weights of
# the cleaned reviews are the model inputs
y = df["Rating"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Cleaned_Review"])

In [38]:
# Densify the TF-IDF matrix so it can be wrapped in a DataFrame for inspection
dense_X = X.toarray()

In [44]:
# Column labels for the TF-IDF matrix: the terms reported by the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()


In [45]:
# Label each TF-IDF column with its term so the weights are readable
df_tfidf = pd.DataFrame(data=dense_X, columns=feature_names)


In [46]:
# Preview the first rows. A bare last expression gets the notebook's rich
# table rendering instead of print()'s line-wrapped plain text.
df_tfidf.head()

   american  atmosphere     blend  brilliant  challenging  character   
0  0.000000         0.0  0.000000        0.0          0.0   0.000000  \
1  0.000000         0.0  0.000000        0.0          0.0   0.000000   
2  0.000000         0.0  0.000000        0.0          0.0   0.000000   
3  0.000000         0.0  0.367556        0.0          0.0   0.000000   
4  0.310056         0.0  0.000000        0.0          0.0   0.310056   

   childhood   classic  combine  conscience  ...  story  surveillance   
0   0.344991  0.000000      0.0    0.344991  ...    0.0      0.000000  \
1   0.000000  0.000000      0.0    0.000000  ...    0.0      0.367556   
2   0.000000  0.000000      0.0    0.000000  ...    0.0      0.000000   
3   0.000000  0.367556      0.0    0.000000  ...    0.0      0.000000   
4   0.000000  0.000000      0.0    0.000000  ...    0.0      0.000000   

        tea  though      town  unforgettable    voyage   whaling       wit   
0  0.000000     0.0  0.344991       0.344991  0.00

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so it can be chained)
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.0
Confusion Matrix:
 [[0 0 0]
 [1 0 1]
 [0 0 0]]


In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import np_utils

2023-07-23 15:50:24.338225: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-23 15:50:24.575968: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-23 15:50:24.577425: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# Convert ratings into a binary classification problem (1 = rated 4 or higher).
# The original star ratings are preserved in 'Raw_Rating' and the label is
# always derived from that copy, so re-running this cell cannot collapse an
# already-binarised column to all zeros.
if 'Raw_Rating' not in df.columns:
    df['Raw_Rating'] = df['Rating']
df['Rating'] = (df['Raw_Rating'] >= 4).astype('int64')

In [21]:
# Map every distinct cleaned review string to an integer id
encoder = LabelEncoder()
encoded_X = encoder.fit_transform(df['Cleaned_Review'])

In [22]:
# Build the network's input matrix from the TF-IDF representation of each
# review. One-hot encoding the label-encoded review strings only gives every
# distinct review its own indicator column, so held-out reviews would share no
# features with the training reviews and nothing could be learned from them.
dummy_X = vectorizer.transform(df['Cleaned_Review']).toarray().astype('float32')


In [23]:
# Hold out 20% of the rows for evaluating the network; seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    dummy_X,
    df['Rating'],
    test_size=0.2,
    random_state=1,
)

In [27]:
# Feed-forward binary classifier: one 8-unit ReLU hidden layer followed by a
# single sigmoid output unit
model1 = Sequential([
    Dense(8, input_dim=dummy_X.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [35]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Fit the network for 10 epochs using batches of up to 10 samples
model1.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2c6c6216c0>

In [37]:
# Evaluate the Keras network (model1) on the held-out split. `model` is the
# scikit-learn LogisticRegression fitted earlier, which has no evaluate()
# method. With metrics=['accuracy'], scores[1] is read as the accuracy.
scores = model1.evaluate(X_test, y_test)
print("\nAccuracy: %.2f%%" % (scores[1]*100))


Accuracy: 50.00%
