In [1]:
# Read dataset
with open("travel_corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

print("Total characters:", len(text))
print(text[:500])   # preview first 500 characters


Total characters: 451801
i m a sucker for a good travel rewards card over the years i ve collected a small handful of them not because i love juggling annual fees but because the right cards can save you a lot of money on flights hotels and travel perks you d otherwise pay out of pocket for not only do i have a bunch for my personal expenses but i have a couple for this business too one of my favorite business credit cards is the capital one venture x business credit card it s capital one s top tier business product i l


In [2]:
import re

# Extract all words
words = re.findall(r"[A-Za-z']+", text.lower())

print("Total words:", len(words))
print("Sample:", words[:20])


Total words: 82220
Sample: ['i', 'm', 'a', 'sucker', 'for', 'a', 'good', 'travel', 'rewards', 'card', 'over', 'the', 'years', 'i', 've', 'collected', 'a', 'small', 'handful', 'of']


In [3]:
X = []
y = []

for i in range(2, len(words)):
    X.append(words[i-2] + " " + words[i-1])   # context: 2 words
    y.append(words[i])                        # label: next word

print("Total samples:", len(X))
print("Example:", X[0], "→", y[0])


Total samples: 82218
Example: i m → a


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline: Vectorizer + Logistic Regression model
model = Pipeline([
    ("vec", CountVectorizer(ngram_range=(1,2), min_df=2)),
    ("clf", SGDClassifier(loss="log_loss", max_iter=30, tol=1e-3))
])

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("✅ Model Accuracy:", accuracy)



✅ Model Accuracy: 0.39978107516419364


In [7]:
def predict_next_word(context):
    context = context.lower()
    return model.predict([context])[0]

print("Prediction for 'travel rewards' →", predict_next_word("travel rewards"))
print("Prediction for 'i love' →", predict_next_word("i love"))
print("Prediction for 'in the' →", predict_next_word("in the"))


Prediction for 'travel rewards' → and
Prediction for 'i love' → and
Prediction for 'in the' → us


In [14]:
from collections import Counter

# Keep top 2000 most frequent next words
word_freq = Counter(y)
common_words = set(word for word, _ in word_freq.most_common(2000))

X_filtered = []
y_filtered = []

for x_val, y_val in zip(X, y):
    if y_val in common_words:
        X_filtered.append(x_val)
        y_filtered.append(y_val)

print("Filtered samples:", len(X_filtered))


Filtered samples: 77597


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    ngram_range=(1,2),
    max_features=3000,   # VERY IMPORTANT
    min_df=3
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Feature shape:", X_train_vec.shape)


Feature shape: (62077, 3000)


In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=50,     # reduce trees
    max_depth=20,       # limit depth
    n_jobs=1,           # IMPORTANT for memory
    random_state=42
)

rf_model.fit(X_train_vec, y_train)
print("✅ Random Forest trained successfully")


✅ Random Forest trained successfully


In [18]:
from sklearn.metrics import accuracy_score

y_pred = rf_model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
print("✅ Random Forest Accuracy:", accuracy)


✅ Random Forest Accuracy: 0.15811855670103092


In [19]:
def predict_next_word_rf(context):
    context = context.lower()
    context_vec = vectorizer.transform([context])
    return rf_model.predict(context_vec)[0]


In [20]:
print("Next word after 'travel rewards' →", predict_next_word_rf("travel rewards"))
print("Next word after 'in the' →", predict_next_word_rf("in the"))
print("Next word after 'i was' →", predict_next_word_rf("i was"))
print("Next word after 'going to' →", predict_next_word_rf("going to"))


Next word after 'travel rewards' → the
Next word after 'in the' → the
Next word after 'i was' → the
Next word after 'going to' → the
