In [15]:
import pandas as pd

header=['label', 'text']
text = pd.read_table('train_orig.txt', header=None)
text.columns = header

replace_dict = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1}
text['label'] = text['label'].replace(replace_dict)

# preprocess data

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split


# Define stop words
stop_words = set(stopwords.words('english'))

# Define stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Apply stemming or lemmatization
    # Uncomment one of the following lines to use stemming or lemmatization
    # stemmed_tokens = [stemmer.stem(word) for word in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join the tokens back into a string
    # Uncomment one of the following lines to use stemmed or lemmatized tokens
    # preprocessed_text = ' '.join(stemmed_tokens)
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text


train_texts, test_texts, train_labels, test_labels = train_test_split(text['text'], text['label'], test_size=0.2, random_state=42)
# Preprocess the training dataset
X_train_preprocessed = [preprocess_text(text) for text in train_texts if len(text) > 5]

# Preprocess the test dataset
X_test_preprocessed = [preprocess_text(text) for text in test_texts if len(text) > 5]

In [7]:
X_train_preprocessed

['said also reserved ticket trump rally phoenix scheduled later day backup plan',
 'full text linking candidate trump trickery',
 'run sharp contrast model created helmut norpoth political science professor stony brook university thats predicting trump running country',
 'warren would loom large fierce presidential primary debate like want move party left believe peel republican moderate even trump supporter centrist message',
 'clear trump know ford testifying thursday',
 'brian segee senior attorney center biological diversity filed initial lawsuit case said group intends appeal disappointing ruling would allow trump shrug crucial environmental law protect people wildlife',
 'believe democrat much lose republican donald trump decides run independent',
 'russian posed american without revealing russian identity communicated unwitting individual associated trump campaign political activist seek coordinate political activity indictment said',
 'tom arnold long claimed tape trump saying 

In [10]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Conv1D, MaxPool1D
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_preprocessed)

# Convert the text to sequences
train_sequences = tokenizer.texts_to_sequences(X_train_preprocessed)
test_sequences = tokenizer.texts_to_sequences(X_test_preprocessed)

# Pad the sequences
maxlen = 100  # maximum sequence length
train_data = pad_sequences(train_sequences, maxlen=maxlen)
test_data = pad_sequences(test_sequences, maxlen=maxlen)

# Define the model
model = Sequential()
model.add(Embedding(5000, 32, input_length=maxlen))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_data, np.array(train_labels), validation_split=0.2, epochs=10, batch_size=32)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_data, np.array(test_labels))
print(f'Test loss: {loss:.2f}, test accuracy: {accuracy:.2f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
1051/5040 [=====>........................] - ETA: 5:01 - loss: 0.6418 - accuracy: 0.6229

KeyboardInterrupt: 