In [1]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.decomposition import PCA
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Sample Spark session
spark = SparkSession.builder.appName("LSIExample").getOrCreate()

# Sample documents
data = spark.createDataFrame([
(0, "I love machine learning"),
(1, "Machine learning is fun"),
(2, "Natural language processing is awesome")
], ["id", "text"])

# Tokenize and convert text into features
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Pipeline for preprocessing
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
model = pipeline.fit(data)
result = model.transform(data)

# Apply PCA for dimensionality reduction (LSI-like)
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
pca_model = pca.fit(result)
lsi_result = pca_model.transform(result)

lsi_result.select("id", "pcaFeatures").show()

ModuleNotFoundError: No module named 'pyspark.ml.decomposition'

In [2]:
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline

# Sample documents
data = spark.createDataFrame([
(0, "I love machine learning"),
(1, "Machine learning is fun"),
(2, "Natural language processing is awesome")
], ["id", "text"])

# Tokenize the text
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Convert words to term frequency vectors
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)

# LDA model to find topics
lda = LDA(k=2, seed=1, optimizer="em", featuresCol="rawFeatures",
topicDistributionCol="topicDistribution")

# Set up pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, lda])

model = pipeline.fit(data)
result = model.transform(data)

# Show the topics in the result
result.select("id", "topicDistribution").show()

# Get the topics
topics = model.stages[-1].describeTopics(5)
topics.show(truncate=False)

NameError: name 'spark' is not defined

In [3]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml import Pipeline

# Sample labeled data (id, text, label)
data = spark.createDataFrame([

(0, "I love machine learning", 0),
(1, "Machine learning is fun", 0),
(2, "Natural language processing is awesome", 1)
], ["id", "text", "label"])

# Tokenize text and convert to term frequency vectors
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Naive Bayes classifier
nb = NaiveBayes(featuresCol="features", labelCol="label")

# Create pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])
model = pipeline.fit(data)
result = model.transform(data)

# Show predictions
result.select("id", "text", "prediction").show()

NameError: name 'spark' is not defined