Skip to content

lj-michale/bigdata-pipeline

Repository files navigation

import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.SparkSession; import scala.Tuple2;

import java.util.Arrays; import java.util.List; import java.util.regex.Pattern;

/**
 * Minimal Spark word-count job.
 *
 * <p>Reads a text file, splits every line on single spaces, counts how often
 * each non-empty token occurs and prints the result to standard output.
 * The job runs with a {@code local} master.
 */
public class WordCount {

    /** Token separator: a single space character. Compiled once and reused. */
    private static final Pattern SPACE = Pattern.compile(" ");

    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_PATH =
            "E:\\code\\rex\\simple-spark-example\\src\\main\\resources\\README";

    /**
     * Runs the word count.
     *
     * @param args optional; {@code args[0]} is the path of the text file to
     *             read. When absent, {@link #DEFAULT_PATH} is used so that
     *             existing argument-less invocations keep working.
     * @throws Exception if the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;

        SparkSession spark = SparkSession
                .builder()
                .appName("JavaWordCount")
                .master("local")
                .getOrCreate();

        // Stop the session even when the job throws, so the local Spark
        // context is not leaked on failure.
        try {
            JavaRDD<String> lines = spark.read().textFile(path).javaRDD();

            // Splitting on a single space yields empty tokens for leading or
            // consecutive spaces; drop them so "" is not counted as a word.
            JavaRDD<String> words = lines
                    .flatMap(line -> Arrays.asList(SPACE.split(line)).iterator())
                    .filter(word -> !word.isEmpty());

            JavaPairRDD<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));

            JavaPairRDD<String, Integer> counts = ones.reduceByKey(Integer::sum);

            List<Tuple2<String, Integer>> output = counts.collect();
            System.out.println("========================result out=============================");
            for (Tuple2<String, Integer> tuple : output) {
                System.out.println(tuple._1() + ": " + tuple._2());
            }
        } finally {
            spark.stop();
        }
    }

}

About

description: bigdata-pipeline

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published