
Commit 19d27b4

Merge branch 'feat_1.8_kafkaUpdateMode_mergedDev' into 'v1.8.0_dev'
Feat 1.8 kafka update mode merged dev. See merge request dt-insight-engine/flinkStreamSQL!12
2 parents cf35435 + 8050cae commit 19d27b4

24 files changed: +1367 −111 lines


README.md

Lines changed: 1 addition & 23 deletions
@@ -7,34 +7,11 @@
 > > * Supports all native FlinkSQL syntax
 > > * Exposes input and output performance metrics to Prometheus
 
-## New features:
-* 1. Kafka source tables support the NOT NULL syntax and string-typed time conversion.
-* 2. RDB side tables reconnect to the DB periodically to keep the connection alive; rdbsink checks the connection before writing.
-* 3. Async side tables support non-equi joins, e.g. <>, <, >.
-* 4. Added Kafka array parsing.
-* 5. Added support for Kafka 1.0 and above.
-* 6. Added postgresql, kudu and clickhouse side-table and result-table support.
-* 7. Plugins can be loaded as dependencies; see the pluginLoadMode parameter.
-* 8. CEP support.
-* 9. UDAF support.
-* 10. Predicate pushdown support.
-* 11. State TTL support.
-
-## Bug fixes:
-* 1. Fixed parsing of ORDER BY and UNION in SQL.
-* 2. Fixed submission failures in yarnPer mode.
-* 3. Assorted bug fixes.
-
 # Supported
 * Source tables: kafka 0.9, 0.10, 0.11, 1.x
 * Side tables: mysql, SQlServer, oracle, hbase, mongo, redis, cassandra, serversocket, kudu, postgresql, clickhouse, impala, db2, sqlserver
 * Result tables: mysql, SQlServer, oracle, hbase, elasticsearch5.x, mongo, redis, cassandra, console, kudu, postgresql, clickhouse, impala, db2, sqlserver
 
-# Roadmap
-* Side-table snapshots
-* Kafka Avro format
-* topN
-
 ## 1 Quick start
 ### 1.1 Run modes
 
@@ -205,6 +182,7 @@ sh submit.sh -sql D:\sideSql.txt -name xctest -remoteSqlPluginPath /opt/dtstack
 * [impala result-table plugin](docs/impalaSink.md)
 * [db2 result-table plugin](docs/db2Sink.md)
 * [sqlserver result-table plugin](docs/sqlserverSink.md)
+* [kafka result-table plugin](docs/kafkaSink.md)
 
 ### 2.3 Side-table plugins
 * [hbase side-table plugin](docs/hbaseSide.md)

core/src/main/java/com/dtstack/flink/sql/format/SerializationMetricWrapper.java

Lines changed: 6 additions & 5 deletions
@@ -24,6 +24,7 @@
 import org.apache.flink.metrics.Counter;
 import org.apache.flink.metrics.Meter;
 import org.apache.flink.metrics.MeterView;
+import org.apache.flink.table.runtime.types.CRow;
 import org.apache.flink.types.Row;
 
 
@@ -34,11 +35,11 @@
  * author: toutian
  * create: 2019/12/24
  */
-public class SerializationMetricWrapper implements SerializationSchema<Row> {
+public class SerializationMetricWrapper implements SerializationSchema<CRow> {
 
     private static final long serialVersionUID = 1L;
 
-    private SerializationSchema<Row> serializationSchema;
+    private SerializationSchema<CRow> serializationSchema;
 
     private transient RuntimeContext runtimeContext;
 
@@ -47,7 +48,7 @@ public class SerializationMetricWrapper implements SerializationSchema<Row> {
     protected transient Meter dtNumRecordsOutRate;
 
 
-    public SerializationMetricWrapper(SerializationSchema<Row> serializationSchema) {
+    public SerializationMetricWrapper(SerializationSchema<CRow> serializationSchema) {
         this.serializationSchema = serializationSchema;
     }
 
@@ -57,7 +58,7 @@ public void initMetric() {
     }
 
     @Override
-    public byte[] serialize(Row element) {
+    public byte[] serialize(CRow element) {
         beforeSerialize();
         byte[] row = serializationSchema.serialize(element);
         afterSerialize();
@@ -79,7 +80,7 @@ public void setRuntimeContext(RuntimeContext runtimeContext) {
         this.runtimeContext = runtimeContext;
     }
 
-    public SerializationSchema<Row> getSerializationSchema() {
+    public SerializationSchema<CRow> getSerializationSchema() {
         return serializationSchema;
     }
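The change above only swaps the element type from Row to CRow; the delegation pattern is unchanged. Below is a minimal sketch of how the CRow-based wrapper is wired up and used. The JsonCRowSerializationSchema constructor arguments are taken from this merge; the helper class itself and the 'upsert' mode string are illustrative assumptions, not part of the commit.

```java
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.serialization.SerializationSchema;
import org.apache.flink.table.runtime.types.CRow;
import org.apache.flink.types.Row;

import com.dtstack.flink.sql.format.SerializationMetricWrapper;
import com.dtstack.flink.sql.sink.kafka.serialization.JsonCRowSerializationSchema;

// Hypothetical helper, not part of the commit: shows how a sink would build and use the wrapper.
public class MetricWrapperSketch {

    // Build the CRow-aware wrapper the way a sink does in open(): the inner schema
    // performs the actual JSON encoding, the wrapper adds the dtstack output metrics.
    public static SerializationMetricWrapper buildWrapper(String schemaString, RuntimeContext ctx) {
        SerializationSchema<CRow> inner =
                new JsonCRowSerializationSchema(schemaString, "upsert"); // ctor introduced by this merge
        SerializationMetricWrapper wrapper = new SerializationMetricWrapper(inner);
        wrapper.setRuntimeContext(ctx); // normally called from the sink's open()
        wrapper.initMetric();           // registers dtNumRecordsOut / dtNumRecordsOutRate
        return wrapper;
    }

    // Every outgoing record is now a CRow: the wrapped Row plus a change flag
    // (true = accumulate, false = retract).
    public static byte[] serializeOne(SerializationMetricWrapper wrapper, Row row, boolean change) {
        return wrapper.serialize(new CRow(row, change));
    }
}
```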

docs/kafkaSink.md

Lines changed: 223 additions & 0 deletions
@@ -0,0 +1,223 @@
## 1. Format:
```
CREATE TABLE tableName(
    colName colType,
    ...
    function(colNameX) AS aliasName,
    WATERMARK FOR colName AS withOffset( colName , delayTime )
 )WITH(
    type ='kafka11',
    bootstrapServers ='ip:port,ip:port...',
    zookeeperQuorum ='ip:port,ip:port/zkparent',
    offsetReset ='latest',
    topic ='topicName',
    groupId='test',
    parallelism ='parllNum'
 );
```

## 2. Supported versions
kafka09, kafka10, kafka11 and above.
**The Kafka versions used for reading and writing must match, otherwise compatibility errors occur.**

## 3. Table structure definition

|Parameter|Meaning|
|----|---|
| tableName | name used in the SQL, i.e. the name registered in the flink-table-env|
| colName | column name|
| colType | column type, see [supported colType values](colType.md)|

## 4. Parameters:

|Parameter|Meaning|Required|Default|
|----|---|---|---|
|type | sink type, one of kafka09, kafka10, kafka11, kafka (for Kafka 1.0 and above)|yes||
|groupId | groupId to use|||
|bootstrapServers | kafka bootstrap-server address list (comma separated)|||
|zookeeperQuorum | kafka ZooKeeper address list (comma separated)|||
|topic | topic to write to|||
|parallelism | parallelism||1|
|partitionKeys | fields used for partitioning|||
|updateMode | how retraction-stream data is emitted: append or upsert. In upsert mode the retract flag is emitted as an extra field.||append|
|sinkdatatype | output format written to kafka: json, avro or csv||json|
|fieldDelimiter | csv field delimiter|| , |

**Kafka-specific properties can be passed through by prefixing them with `kafka.`, for example:**
```
kafka.consumer.id
kafka.socket.timeout.ms
kafka.fetch.message.max.bytes
kafka.num.consumer.fetchers
kafka.auto.commit.enable
kafka.auto.commit.interval.ms
kafka.queued.max.message.chunks
kafka.rebalance.max.retries
kafka.fetch.min.bytes
kafka.fetch.wait.max.ms
kafka.rebalance.backoff.ms
kafka.refresh.leader.backoff.ms
kafka.consumer.timeout.ms
kafka.exclude.internal.topics
kafka.partition.assignment.strategy
kafka.client.id
kafka.zookeeper.session.timeout.ms
kafka.zookeeper.connection.timeout.ms
kafka.zookeeper.sync.time.ms
kafka.offsets.storage
kafka.offsets.channel.backoff.ms
kafka.offsets.channel.socket.timeout.ms
kafka.offsets.commit.max.retries
kafka.dual.commit.enabled
kafka.partition.assignment.strategy
kafka.socket.receive.buffer.bytes
kafka.fetch.min.bytes
```
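A minimal sketch of this pass-through idea, assuming the connector simply strips the `kafka.` prefix before copying the remaining key into the `Properties` handed to the Kafka client; the helper below is hypothetical, not the plugin's actual parsing code.

```java
import java.util.Map;
import java.util.Properties;

public class KafkaPropertyPassThroughSketch {

    private static final String KAFKA_PREFIX = "kafka.";

    // Hypothetical helper: copy every "kafka.*" entry from the WITH(...) options
    // into the Properties passed to the Kafka producer, minus the prefix.
    public static Properties extractKafkaProps(Map<String, String> withOptions) {
        Properties props = new Properties();
        for (Map.Entry<String, String> entry : withOptions.entrySet()) {
            if (entry.getKey().startsWith(KAFKA_PREFIX)) {
                props.put(entry.getKey().substring(KAFKA_PREFIX.length()), entry.getValue());
            }
        }
        return props;
    }
}
```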

## 5. Examples:

### json format:
```
CREATE TABLE MyResult(
    channel varchar,
    pv varchar
 )WITH(
    type='kafka',
    bootstrapServers='172.16.8.107:9092',
    topic='mqTest02',
    parallelism ='2',
    partitionKeys = 'channel,pv',
    updateMode='upsert'
);

Data emitted in upsert mode: {"channel":"zs","pv":"330","retract":true}
Data emitted in append mode: {"channel":"zs","pv":"330"}

```
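The `retract` field in upsert mode mirrors the change flag Flink attaches to each record of a retraction stream. The sketch below shows how such a payload can be produced from a `CRow` using Jackson; it illustrates the output format only, is not the plugin's `JsonCRowSerializationSchema` implementation, and the mapping of the change flag onto `retract` is an assumption.

```java
import org.apache.flink.table.runtime.types.CRow;
import org.apache.flink.types.Row;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class UpsertJsonSketch {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    // Serialize a CRow into the upsert-mode JSON shown above: the row fields plus
    // a boolean "retract" derived from the CRow change flag.
    public static byte[] toUpsertJson(String[] fieldNames, CRow cRow) throws Exception {
        Row row = cRow.row();
        ObjectNode node = MAPPER.createObjectNode();
        for (int i = 0; i < fieldNames.length; i++) {
            node.put(fieldNames[i], String.valueOf(row.getField(i)));
        }
        // change() == true means an accumulate message; the doc marks retractions with
        // retract=true, so the flag is assumed here to be the negation of change().
        node.put("retract", !cRow.change());
        return MAPPER.writeValueAsBytes(node);
    }
}
```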

### avro format:

If updateMode='upsert', schemaInfo must declare the retract attribute.

```
CREATE TABLE MyTable(
    channel varchar,
    pv varchar
    --xctime bigint
 )WITH(
    type='kafka',
    bootstrapServers='172.16.8.107:9092',
    groupId='mqTest01',
    offsetReset='latest',
    topic='mqTest01',
    parallelism ='1',
    topicIsPattern ='false'
 );

create table sideTable(
    channel varchar,
    xccount int,
    PRIMARY KEY(channel),
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type='mysql',
    url='jdbc:mysql://172.16.8.109:3306/test?charset=utf8',
    userName='dtstack',
    password='abc123',
    tableName='sidetest',
    cache = 'LRU',
    cacheTTLMs='10000',
    parallelism ='1'
 );


CREATE TABLE MyResult(
    channel varchar,
    pv varchar
 )WITH(
    --type='console'
    type='kafka',
    bootstrapServers='172.16.8.107:9092',
    topic='mqTest02',
    parallelism ='1',
    updateMode='upsert',
    sinkdatatype = 'avro',
    schemaInfo = '{"type":"record","name":"MyResult","fields":[
        {"name":"channel","type":"string"},
        {"name":"pv","type":"string"},
        {"name":"retract","type":"boolean"}]}'
 );


insert into MyResult
select
    a.channel as channel,
    a.pv as pv
from
    MyTable a
```
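Because upsert mode requires `schemaInfo` to declare the `retract` field, it can help to validate the schema string up front. A small illustrative check using the standard Avro parser (not part of the plugin):

```java
import org.apache.avro.Schema;

public class AvroRetractCheckSketch {

    // Parse schemaInfo and verify it declares the boolean "retract" field
    // that updateMode='upsert' expects.
    public static void requireRetractField(String schemaInfo) {
        Schema schema = new Schema.Parser().parse(schemaInfo);
        Schema.Field retract = schema.getField("retract");
        if (retract == null || retract.schema().getType() != Schema.Type.BOOLEAN) {
            throw new IllegalArgumentException(
                    "updateMode='upsert' requires a boolean 'retract' field in schemaInfo");
        }
    }
}
```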

### csv format:

```
CREATE TABLE MyTable(
    channel varchar,
    pv varchar
    --xctime bigint
 )WITH(
    type='kafka',
    bootstrapServers='172.16.8.107:9092',
    groupId='mqTest01',
    offsetReset='latest',
    topic='mqTest01',
    parallelism ='2',
    topicIsPattern ='false'
 );

create table sideTable(
    channel varchar,
    xccount int,
    PRIMARY KEY(channel),
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type='mysql',
    url='jdbc:mysql://172.16.8.109:3306/test?charset=utf8',
    userName='dtstack',
    password='abc123',
    tableName='sidetest',
    cache = 'LRU',
    cacheTTLMs='10000',
    parallelism ='1'
 );


CREATE TABLE MyResult(
    channel varchar,
    pv varchar
 )WITH(
    type='kafka',
    bootstrapServers='172.16.8.107:9092',
    topic='mqTest02',
    parallelism ='2',
    updateMode='upsert',
    sinkdatatype = 'csv',
    fieldDelimiter='*'
 );


insert into MyResult
select
    a.channel as channel,
    a.pv as pv
from
    MyTable a
```

kafka-base/kafka-base-sink/src/main/java/com/dtstack/flink/sql/sink/kafka/AbstractKafkaProducerFactory.java

Lines changed: 16 additions & 21 deletions
@@ -19,16 +19,16 @@
 
 import com.dtstack.flink.sql.format.FormatType;
 import com.dtstack.flink.sql.format.SerializationMetricWrapper;
+import com.dtstack.flink.sql.sink.kafka.serialization.AvroCRowSerializationSchema;
+import com.dtstack.flink.sql.sink.kafka.serialization.CsvCRowSerializationSchema;
+import com.dtstack.flink.sql.sink.kafka.serialization.JsonCRowSerializationSchema;
 import com.dtstack.flink.sql.sink.kafka.table.KafkaSinkTableInfo;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.flink.api.common.serialization.SerializationSchema;
 import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.formats.avro.AvroRowSerializationSchema;
-import org.apache.flink.formats.csv.CsvRowSerializationSchema;
-import org.apache.flink.formats.json.JsonRowSerializationSchema;
 import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
 import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;
-import org.apache.flink.types.Row;
+import org.apache.flink.table.runtime.types.CRow;
 
 import java.util.Optional;
 import java.util.Properties;
@@ -51,42 +51,37 @@ public abstract class AbstractKafkaProducerFactory {
      * @param partitioner
      * @return
      */
-    public abstract RichSinkFunction<Row> createKafkaProducer(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<Row> typeInformation, Properties properties, Optional<FlinkKafkaPartitioner<Row>> partitioner, String[] partitionKeys);
+    public abstract RichSinkFunction<CRow> createKafkaProducer(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<CRow> typeInformation, Properties properties, Optional<FlinkKafkaPartitioner<CRow>> partitioner, String[] partitionKeys);
 
-    protected SerializationMetricWrapper createSerializationMetricWrapper(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<Row> typeInformation) {
-        return new SerializationMetricWrapper(createSerializationSchema(kafkaSinkTableInfo, typeInformation));
+    protected SerializationMetricWrapper createSerializationMetricWrapper(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<CRow> typeInformation) {
+        SerializationSchema<CRow> serializationSchema = createSerializationSchema(kafkaSinkTableInfo, typeInformation);
+        return new SerializationMetricWrapper(serializationSchema);
     }
 
-    private SerializationSchema<Row> createSerializationSchema(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<Row> typeInformation) {
-        SerializationSchema<Row> serializationSchema = null;
+    private SerializationSchema<CRow> createSerializationSchema(KafkaSinkTableInfo kafkaSinkTableInfo, TypeInformation<CRow> typeInformation) {
+        SerializationSchema<CRow> serializationSchema = null;
         if (FormatType.JSON.name().equalsIgnoreCase(kafkaSinkTableInfo.getSinkDataType())) {
-
             if (StringUtils.isNotBlank(kafkaSinkTableInfo.getSchemaString())) {
-                serializationSchema = new JsonRowSerializationSchema(kafkaSinkTableInfo.getSchemaString());
+                serializationSchema = new JsonCRowSerializationSchema(kafkaSinkTableInfo.getSchemaString(), kafkaSinkTableInfo.getUpdateMode());
             } else if (typeInformation != null && typeInformation.getArity() != 0) {
-                serializationSchema = new JsonRowSerializationSchema(typeInformation);
+                serializationSchema = new JsonCRowSerializationSchema(typeInformation, kafkaSinkTableInfo.getUpdateMode());
             } else {
                 throw new IllegalArgumentException("sinkDataType:" + FormatType.JSON.name() + " must set schemaString(JSON Schema)or TypeInformation<Row>");
             }
-
         } else if (FormatType.CSV.name().equalsIgnoreCase(kafkaSinkTableInfo.getSinkDataType())) {
-
             if (StringUtils.isBlank(kafkaSinkTableInfo.getFieldDelimiter())) {
                 throw new IllegalArgumentException("sinkDataType:" + FormatType.CSV.name() + " must set fieldDelimiter");
             }
-
-            final CsvRowSerializationSchema.Builder serSchemaBuilder = new CsvRowSerializationSchema.Builder(typeInformation);
+            final CsvCRowSerializationSchema.Builder serSchemaBuilder = new CsvCRowSerializationSchema.Builder(typeInformation);
             serSchemaBuilder.setFieldDelimiter(kafkaSinkTableInfo.getFieldDelimiter().toCharArray()[0]);
-            serializationSchema = serSchemaBuilder.build();
+            serSchemaBuilder.setUpdateMode(kafkaSinkTableInfo.getUpdateMode());
 
+            serializationSchema = serSchemaBuilder.build();
         } else if (FormatType.AVRO.name().equalsIgnoreCase(kafkaSinkTableInfo.getSinkDataType())) {
-
            if (StringUtils.isBlank(kafkaSinkTableInfo.getSchemaString())) {
                 throw new IllegalArgumentException("sinkDataType:" + FormatType.AVRO.name() + " must set schemaString");
             }
-
-            serializationSchema = new AvroRowSerializationSchema(kafkaSinkTableInfo.getSchemaString());
-
+            serializationSchema = new AvroCRowSerializationSchema(kafkaSinkTableInfo.getSchemaString(), kafkaSinkTableInfo.getUpdateMode());
         }
 
         if (null == serializationSchema) {
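For context, a concrete factory feeds the metric-wrapped CRow serializer built by `createSerializationMetricWrapper` into a version-specific producer. A rough sketch under that assumption: `Kafka011ProducerFactorySketch` is hypothetical, `getTopic()` is an assumed getter on `KafkaSinkTableInfo`, and the real plugin uses its own producer subclass so it can call `setRuntimeContext`/`initMetric` on the wrapper in `open()`.

```java
import java.util.Optional;
import java.util.Properties;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;
import org.apache.flink.table.runtime.types.CRow;

import com.dtstack.flink.sql.sink.kafka.AbstractKafkaProducerFactory;
import com.dtstack.flink.sql.sink.kafka.table.KafkaSinkTableInfo;

// Hypothetical concrete factory: wires the metric-wrapped CRow serializer into the
// stock Kafka 0.11 producer for illustration only.
public class Kafka011ProducerFactorySketch extends AbstractKafkaProducerFactory {

    @Override
    public RichSinkFunction<CRow> createKafkaProducer(KafkaSinkTableInfo kafkaSinkTableInfo,
                                                      TypeInformation<CRow> typeInformation,
                                                      Properties properties,
                                                      Optional<FlinkKafkaPartitioner<CRow>> partitioner,
                                                      String[] partitionKeys) {
        return new FlinkKafkaProducer011<>(
                kafkaSinkTableInfo.getTopic(),                                    // assumed getter
                createSerializationMetricWrapper(kafkaSinkTableInfo, typeInformation),
                properties,
                partitioner);
    }
}
```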
